-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathimport_tree.py
148 lines (124 loc) · 5.24 KB
/
import_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import glob
import os
import sys
import logging
import re
import collections
import copy
import array
import numpy as np
import pandas as pd
import parameters
from root_numpy import tree2array, rec2array
from ROOT import TChain, TFile, TTree
###############################################################################
# Tree2Pandas#
###############################################################################
def Tree2Pandas(input_file, variables, weight=None, cut=None, reweight_to_cross_section=False, n=None, tree_name='tree',start=None):
    """
    Convert a ROOT TTree into a pandas DataFrame.

    Parameters
    ----------
    input_file : path of the ROOT file to open.
    variables : list of branch names to read. Must not contain duplicates
        (the process exits with status 1 if it does). The list passed by
        the caller is never modified.
    weight : optional name of the weight branch. It is read in addition to
        ``variables`` and, when ``reweight_to_cross_section`` is set, scaled
        by ``cross_section / event_weight_sum``.
    cut : optional selection string forwarded to ``tree2array``.
    reweight_to_cross_section : if True, read the ``cross_section`` and
        ``event_weight_sum`` objects from the file to build the relative
        weight applied to the ``weight`` column.
    n : optional stop entry forwarded to ``tree2array``. ``-1`` means
        "up to the last entry of the tree".
    tree_name : name of the tree inside the file.
    start : optional first entry forwarded to ``tree2array``.

    Returns
    -------
    pandas.DataFrame with one column per requested branch (plus the weight
    branch when given).
    """
    # Work on a copy: the weight branch may be appended below and the
    # caller's list must stay untouched.
    variables = copy.copy(variables)

    # Repeated branch names make root_numpy crash, so refuse them up front.
    rep = [item for item, count in collections.Counter(variables).items() if count > 1]
    if rep:
        for r in rep:
            logging.critical('The argument "%s" is repeated in the variables'%r)
        sys.exit(1)

    file_handle = TFile.Open(input_file)
    try:
        tree = file_handle.Get(tree_name)
        N = tree.GetEntries()
        logging.debug('\t\tNumber of events : '+str(N))

        relative_weight = 1
        if reweight_to_cross_section:
            cross_section = file_handle.Get('cross_section').GetVal()
            event_weight_sum = file_handle.Get("event_weight_sum").GetVal()
            relative_weight = cross_section / event_weight_sum
            logging.debug('\t\tReweighting requested')
            logging.debug('\t\t\tCross section : '+str(cross_section))
            logging.debug('\t\t\tEvent weight sum : '+str(event_weight_sum))
            logging.debug('\t\t\tRelative weight : '+str(relative_weight))

        # Only part of the tree requested #
        if n:
            if n == -1:
                # Resolve "all entries" BEFORE reading: passed as-is, a stop
                # of -1 follows slice semantics and drops the last entry.
                n = N
            if start:
                if n < start:
                    logging.critical('Importing tree with start higher than end, will output empty tree')
                logging.info("Reading from {} to {} in input tree".format(start,n))
            else:
                logging.info("Reading only {} from input tree".format(n))

        # Read the weight branch too, unless it was already requested
        # (adding it twice would recreate the duplicate-branch crash).
        if weight is not None and weight not in variables:
            variables.append(weight)

        # Read the tree into a numpy structured array, then wrap it #
        data = tree2array(tree, branches=variables, selection=cut, start=start, stop=n)
        df = pd.DataFrame(data)
        if weight is not None:
            df[weight] *= relative_weight
    finally:
        # Always release the ROOT file, even when reading failed.
        if file_handle:
            file_handle.Close()
    return df
###############################################################################
# LoopOverTrees #
###############################################################################
def LoopOverTrees(input_dir, variables, weight=None, tag=None, cut=None, reweight_to_cross_section=False, n=None, list_sample=None, start=None):
    """
    Loop over ROOT trees inside input_dir and process them using Tree2Pandas.

    Parameters
    ----------
    input_dir : directory holding the ROOT files; the process exits with
        status 1 if it is not a directory.
    variables, weight, reweight_to_cross_section, n, start : forwarded
        unchanged to ``Tree2Pandas`` for every file.
    tag : optional regular expression; only files whose name (relative to
        ``input_dir``) matches are read, and the tag is stored in a ``tag``
        column of the output.
    cut : optional selection, AND-ed with ``parameters.cut``. When omitted,
        ``parameters.cut`` alone is used.
    list_sample : optional list of file names relative to ``input_dir``.
        When omitted, every ``*.root`` file of ``input_dir`` is used.

    Returns
    -------
    pandas.DataFrame concatenating all processed files with a fresh
    0..N-1 index. Columns ``mH_gen`` and ``mA_gen`` hold the third and
    fourth integers found in the file name for files whose name contains
    ``HToZA``, and 0 otherwise. An empty DataFrame is returned when no
    file was processed.
    """
    # Check if directory #
    if not os.path.isdir(input_dir):
        logging.critical("LoopOverTrees : Not a directory")
        sys.exit(1)
    logging.debug("Accessing directory : "+input_dir)

    # Combine the requested cut with the one in the parameters.py file.
    # Parentheses keep the meaning intact when either side contains "||".
    base_cut = parameters.cut
    if not cut:
        cut = base_cut
    elif base_cut:
        cut = "(%s) && (%s)" % (cut, base_cut)

    # Whether to use a given sample list or loop over files inside a dir #
    if list_sample is None:
        sample_paths = glob.glob(os.path.join(input_dir,"*.root"))
    else:
        # Tolerate both "file.root" and "/file.root", with or without a
        # trailing separator on input_dir.
        sample_paths = [os.path.join(input_dir, s.lstrip(os.sep)) for s in list_sample]

    # Loop over the files #
    frames = []
    for name in sample_paths:
        filename = os.path.relpath(name, input_dir)
        logging.debug("\tAccessing file : %s"%filename)

        # If a tag for specific files has been requested #
        if tag is not None:
            if not re.search(tag,filename):
                continue
            logging.debug('\t\t-> Matched sample')

        # Get the data as pandas df #
        df = Tree2Pandas(input_file = name,
                         variables = variables,
                         weight = weight,
                         cut = cut,
                         reweight_to_cross_section = reweight_to_cross_section,
                         n = n,
                         tree_name = 'tree',
                         start = start)

        # Find mH, mA #
        if 'HToZA' in filename: # Signal -> masses are encoded in the file name
            numbers = re.findall(r'\d+', filename)
            mH = int(numbers[2])
            mA = int(numbers[3])
        else: # Background, set them at 0
            mH = 0
            mA = 0

        # Register in DF (a scalar is broadcast to every row) #
        df['mH_gen'] = mH
        df['mA_gen'] = mA

        # Register the tag if provided #
        if tag is not None:
            df['tag'] = tag

        frames.append(df)

    if not frames:
        return pd.DataFrame()

    # Single concatenation; ignore_index avoids an index repetition for each file
    return pd.concat(frames, ignore_index=True)