-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathPrepFromQeMFi.py
113 lines (94 loc) · 3.44 KB
/
PrepFromQeMFi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 12 15:06:02 2024
@author: vvinod
"""
import numpy as np
from sklearn.utils import shuffle
def prep_data(prop='EV',rep='CM'):
    '''
    Assemble shuffled train/validation/test splits from the QeMFi dataset.

    For each of the nine molecules, 15000 samples are read, their
    representations are zero-padded to the feature width of o-hbdi, and
    everything is stacked into one 135000-row set. The stacked set is then
    shuffled (fixed seed) and split into 120000 training, 2000 validation
    and 13000 test rows.

    Parameters
    ----------
    prop : str, optional
        Property of interest to be extracted from QeMFi; either 'EV' or
        'SCF'. The default is 'EV'. For 'EV' only index 0 of the last axis
        of the stored array is used (presumably the first excited state -
        TODO confirm against the QeMFi documentation).
    rep : str, optional
        The type of representation to be used; selects the
        ``../QeMFi/MFML/Reps/<molecule>_<rep>.npy`` files.
        The default is 'CM'.

    Returns
    -------
    X_train : np.ndarray
        training reps (first 120000 shuffled rows).
    X_val : np.ndarray
        validation reps (next 2000 rows). used only for o-MFML.
    X_test : np.ndarray
        test reps (remaining rows).
    y_train : np.ndarray
        object array of length 5; entry ``i`` holds the training values of
        fidelity column ``i``.
    y_val : np.ndarray
        validation values of the last fidelity column (TZVP according to
        the original notes).
    y_test : np.ndarray
        test values of the last fidelity column (TZVP according to the
        original notes).
    idx_names : np.ndarray
        For every shuffled row, the index of the molecule it came from
        (0-urea, ..., 8-o-hbdi). Stored as float, covering all 135000 rows.

    Raises
    ------
    ValueError
        If ``prop`` is not 'EV' or 'SCF'.
    '''
    # Fail fast: without this check an unknown property would only surface
    # as a NameError after the large arrays had been allocated and loaded.
    if prop not in ('EV', 'SCF'):
        raise ValueError(f"prop must be 'EV' or 'SCF', got {prop!r}")

    molnames = ['urea','acrolein','alanine','sma','nitrophenol',
                'urocanic','dmabn','thymine','o-hbdi']
    n_per_mol = 15000                       # samples taken from each molecule
    n_fidelities = 5                        # columns of the property arrays
    n_total = n_per_mol * len(molnames)     # 135000 rows overall
    n_train = 120000                        # rows [0, 120000) -> training
    n_val_end = 122000                      # rows [120000, 122000) -> validation

    # The same within-molecule permutation is applied to every molecule.
    idx = shuffle(np.arange(0, n_per_mol), random_state=42)

    # o-hbdi is taken as the reference feature width (the largest SLATM is
    # for o-hbdi with 6438 features); all other molecules are zero-padded
    # to it. A memory map is enough here because only the shape is needed.
    n_features = np.load(f'../QeMFi/MFML/Reps/o-hbdi_{rep}.npy',
                         mmap_mode='r').shape[1]

    X = np.zeros((n_total, n_features), dtype=float)
    y_all = np.zeros((n_total, n_fidelities), dtype=float)
    idx_names = np.zeros((n_total), dtype=float)

    for i, m in enumerate(molnames):
        start = i * n_per_mol
        end = start + n_per_mol
        idx_names[start:end] = i

        temp_X = np.load(f'../QeMFi/MFML/Reps/{m}_{rep}.npy')
        # Columns beyond this molecule's own width keep their zero padding.
        X[start:end, :temp_X.shape[-1]] = temp_X[idx, :]

        # Context manager so the .npz archive handle is closed promptly.
        with np.load(f'../QeMFi/dataset/QeMFi_{m}.npz') as data:
            if prop == 'EV':
                temp_data = data['EV'][:, :, 0]
            else:
                temp_data = data['SCF']
        y_all[start:end, :] = temp_data[idx, :]

    # One joint shuffle keeps reps, molecule IDs and every fidelity column
    # aligned by construction; it yields the same permutation as separate
    # calls with the same seed on arrays of the same length.
    X, idx_names, y_all = shuffle(X, idx_names, y_all, random_state=42)

    y_train = np.empty((n_fidelities), dtype=object)
    for i in range(n_fidelities):
        y_train[i] = y_all[:n_train, i]

    X_train = X[:n_train, :]
    X_val = X[n_train:n_val_end, :]
    X_test = X[n_val_end:, :]
    # Validation and test targets use only the last fidelity column.
    y_val = y_all[n_train:n_val_end, -1]
    y_test = y_all[n_val_end:, -1]

    return X_train, X_val, X_test, y_train, y_val, y_test, idx_names
def main_SCF_SLATM():
    '''
    Prepare the SCF property with SLATM representations and write the
    resulting splits, plus the molecule-ID array, to the ``Data`` directory.

    Files written: ``SLATM_{train,val,test}.npy``, ``SCF_{train,val,test}.npy``
    and ``idx_names.npy``.
    '''
    import os

    # Create the output directory up front so a missing folder does not
    # abort the run after the expensive data preparation has finished.
    os.makedirs('Data', exist_ok=True)

    X_train, X_val, X_test, y_train, y_val, y_test, idx_names = prep_data(prop='SCF',rep='SLATM')

    # Each array is written exactly once (the training reps used to be
    # saved twice to the same path).
    np.save('Data/SLATM_train.npy',X_train)
    np.save('Data/SLATM_val.npy',X_val)
    np.save('Data/SLATM_test.npy',X_test)
    # y_train is an object array (one entry per fidelity), so NumPy stores
    # it via pickle; loading it back requires allow_pickle=True.
    np.save('Data/SCF_train.npy',y_train)
    np.save('Data/SCF_val.npy',y_val)
    np.save('Data/SCF_test.npy',y_test)
    np.save('Data/idx_names.npy',idx_names)
def main_EV_CM():
    '''
    Prepare the EV property with CM representations and write the
    resulting splits to the ``Data`` directory.

    Files written: ``CM_{train,val,test}.npy`` and ``EV_{train,val,test}.npy``.
    '''
    import os

    # Create the output directory up front so a missing folder does not
    # abort the run after the expensive data preparation has finished.
    os.makedirs('Data', exist_ok=True)

    X_train, X_val, X_test, y_train, y_val, y_test, idx_names = prep_data(prop='EV',rep='CM')

    # Each array is written exactly once (the training reps used to be
    # saved twice to the same path).
    np.save('Data/CM_train.npy',X_train)
    np.save('Data/CM_val.npy',X_val)
    np.save('Data/CM_test.npy',X_test)
    # y_train is an object array (one entry per fidelity), so NumPy stores
    # it via pickle; loading it back requires allow_pickle=True.
    np.save('Data/EV_train.npy',y_train)
    np.save('Data/EV_val.npy',y_val)
    np.save('Data/EV_test.npy',y_test)
    # idx_names is deliberately not written here: it depends only on the
    # molecule order and the fixed shuffle seed, and main_SCF_SLATM already
    # writes it to Data/idx_names.npy.
# Run the EV/CM preparation only when this file is executed as a script, so
# that importing the module (e.g. to reuse prep_data) does not trigger data
# loading and file writes.
if __name__ == "__main__":
    main_EV_CM()