-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathproduce_output.py
182 lines (158 loc) · 8.39 KB
/
produce_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import os
import copy
import sys
import logging
import numpy as np
import pandas as pd
import subprocess
from root_numpy import array2root
import ROOT
from NeuralNet import HyperModel
from import_tree import Tree2Pandas
from signal_coupling import Decoupler, Recoupler
from parameterize_classifier import ParametrizeClassifier
import parameters
class ProduceOutput:
    """Evaluate one or several trained networks and write their outputs to ROOT files.

    Wraps trained models (restored through ``HyperModel``) and appends their
    predictions as new columns next to the input data, then saves everything
    with root_numpy's ``array2root`` — either split by the 'tag' column or as
    a single named file.
    """
    def __init__(self,model,list_model,is_signal=False,generator=False,is_class_param=False,list_inputs=None):
        # model          : identifier of the NeuralNet model handed to HyperModel
        # list_model     : model types to evaluate; each must be one of
        #                  'DY','TT','HToZA','class_global','class_param','binary','ME'
        # is_signal      : if True, data is Decoupler'd before and Recoupler'd after evaluation
        # generator      : if True, HyperRestore reads batches from generator_filepath
        # is_class_param : if True, inputs are re-parametrized via ParametrizeClassifier
        # list_inputs    : input branch names; defaults to a deep copy of parameters.inputs
        #                  (deep-copied so the shared module-level list is never mutated)
        self.model = model # Model of the NeuralNet
        self.is_signal = is_signal
        self.is_class_param = is_class_param
        self.list_model = list_model
        self.list_inputs = list_inputs
        self.generator = generator
        self.generator_filepath = None
        if self.list_inputs is None:
            self.list_inputs = copy.deepcopy(parameters.inputs)
    def OutputFromTraining(self,data,path_output,output_name=None):
        """
        Get the output of the model from the test set
        This is data separated from the training
        If output_name is specified, the whole data will be written in 'output_name'.root
        if not, the tags in the dataframe are used to split into different files with names 'tag'.root
        """
        # If signal, decouple the data #
        if self.is_signal:
            decoupled_name = 'weight_HToZA'
            list_to_decouple = parameters.outputs
            # Bug fix: this method can be called several times on the same
            # instance (e.g. once per split file from OutputNewData), and the
            # previous '+=' appended the MEM masses again on every call,
            # duplicating input columns. Only add them if absent.
            for extra in ('mH_MEM','mA_MEM'):
                if extra not in self.list_inputs:
                    self.list_inputs.append(extra)
            data = Decoupler(data,decoupled_name,list_to_decouple)
        if self.is_class_param:
            signal_name = '-log10(weight_HToZA)'
            data = ParametrizeClassifier(data,name=signal_name)
            self.list_inputs = ['-log10(weight_DY)','-log10(weight_TT)',signal_name,'mH_gen','mA_gen']
        inputs = data[self.list_inputs].values if data is not None else None
        output = None
        columns = []
        # Get Model Output #
        for model in self.list_model:
            if model not in ['DY','TT','HToZA','class_global','class_param','binary','ME']:
                # Bug fix: the '%s' placeholder was previously never filled in,
                # so the log line printed a literal '%s'. Use logging's lazy
                # percent-argument so 'model' is interpolated.
                logging.critical('Wrong model type specified : %s must be either "DY", "TT", "HToZA", "class_global", "class_param", "binary" or "ME"',model)
                sys.exit(1)
            instance = HyperModel(self.model,model)
            if model in ['DY','TT','HToZA','ME']:
                # Regression models are trained on -log10(weight): undo it here
                out = np.power(10,-instance.HyperRestore(inputs,generator=self.generator,generator_filepath=self.generator_filepath))
                columns.append('output_%s'%model)
            else:
                out = instance.HyperRestore(inputs,generator=self.generator,generator_filepath=self.generator_filepath)
                if model in ['binary']:
                    columns.extend(['Prob_MEM_signal'])
                if model in ['class_param','class_global']:
                    columns.extend(['Prob_MEM_DY','Prob_MEM_HToZA','Prob_MEM_TT'])
            if output is None: # First element of loop
                output = copy.deepcopy(out) # TODO : fix data_generator for last smaller batch
            else: # append next elements
                output = np.c_[output,out]
        # From numpy output array to df #
        # NOTE(review): if list_model is empty, 'output' is still None here and
        # the next line will raise — presumably callers always pass >=1 model.
        output_df = pd.DataFrame(output,columns=columns,index=pd.RangeIndex(start=0,stop=output.shape[0]))
        # Make full df #
        full_df = pd.concat([data,output_df],axis=1)
        # Unresolved issue in DataGenerator :
        # Cannot use smaller batches than batch_size to truncate the last elements of array
        # that do not fit in a last batch
        # TODO : fix that
        # In the meantime, also truncate the output array, otherwise will fill nan
        full_df = full_df[:output.shape[0]]
        # Check signal case , needs to recouple parameters #
        if self.is_signal:
            logging.info("Signal case : Recoupling of the data")
            full_df = Recoupler(full_df,col_to_recouple = ['weight_HToZA','output_HToZA'],N=len(list_to_decouple))
        # Get the unique tags as a list #
        if output_name is None:
            tag_list = list(full_df['tag'].unique())
            # Loop over tags #
            for tag in tag_list:
                tag_df = full_df.loc[full_df['tag']==tag] # We select the rows corresponding to this tag
                tag_df = tag_df.drop('tag',axis=1)
                # From df to numpy array with dtype #
                tag_output = tag_df.to_records(index=False,column_dtypes='float64')
                tag_output.dtype.names = parameters.make_dtype(tag_output.dtype.names)# because ( ) and . are an issue for root_numpy
                tag_output_name = os.path.join(path_output,tag+'.root')
                # Save as root file #
                array2root(tag_output,tag_output_name,mode='recreate')
                logging.info('Output saved as : '+tag_output_name)
        else:
            # From df to numpy array with dtype #
            full_output = full_df.to_records(index=False,column_dtypes='float64')
            full_output.dtype.names = parameters.make_dtype(full_output.dtype.names)# because ( ) and . are an issue for root_numpy
            full_output_name = os.path.join(path_output,output_name)
            array2root(full_output,full_output_name,mode='recreate')
            logging.info('Output saved as : '+full_output_name)
    def OutputNewData(self,input_dir,list_sample,path_output,variables=None):
        """
        Given a model, produce the output
        The Network has never seen this data !
        """
        # Loop over datasets #
        logging.info('Input directory : %s'%input_dir)
        for f in list_sample:
            name = os.path.basename(f)
            full_path = os.path.join(input_dir,f)
            logging.info('Looking at %s'%f)
            # Get the data #
            if variables is None:
                var = parameters.inputs+parameters.outputs+parameters.other_variables
            else:
                var = copy.deepcopy(variables) # Avoid bug where variables is changed at each new file
            if self.generator:
                # Get number of events in file #
                rootfile = ROOT.TFile(full_path)
                tree = rootfile.Get("tree")
                N = tree.GetEntries()
                rootfile.Close()
                self.generator = False # Do not use the generator since we need the whole array
                # Build [start,stop) slices of at most output_batch_size events
                slices = list(range(0,N,parameters.output_batch_size))
                if slices[-1] != N:
                    slices += [N]
                limits = list(zip(slices[:-1], slices[1:]))
                list_files = []
                for idx,(start,stop) in enumerate(limits):
                    logging.debug("Processing tree from %d to %d (total = %d)"%(start,stop,N))
                    # NOTE(review): 'n' is passed the stop index here — assumes
                    # Tree2Pandas treats it as a stop position, not a count; confirm.
                    data = Tree2Pandas(input_file=full_path,
                                       variables=var,
                                       weight=parameters.weights,
                                       cut = parameters.cut,
                                       reweight_to_cross_section=False,
                                       start=start,
                                       n = stop)
                    split_file = name.replace('.root','_%d.root'%idx)
                    list_files.append(os.path.join(path_output,split_file))
                    self.OutputFromTraining(data=data,path_output=path_output,output_name=split_file)
                # Concatenate the output and remove split #
                cmd = ["hadd",os.path.join(path_output,name)]+list_files
                subprocess.run(cmd)
                logging.info("Produced concatenated file at %s"%(os.path.join(path_output,name)))
                for split_file in list_files:
                    os.remove(split_file)
                    logging.debug("... Deleted split file %s"%split_file)
                logging.info("Cleaning of split files done")
            else:
                data = Tree2Pandas(input_file=full_path,
                                   variables=var,
                                   weight=parameters.weights,
                                   cut = parameters.cut,
                                   reweight_to_cross_section=False)
                if data.shape[0]==0:
                    logging.info('\tEmpty tree')
                    continue # Avoids empty trees
                self.OutputFromTraining(data=data,path_output=path_output,output_name=name)