-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautore_perloutput.py
109 lines (92 loc) · 3.77 KB
/
autore_perloutput.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
'''
Work up the results of an autoRE script.
July 28 2022, 11:11 am, Bryan Lau
Takes the autoRE result.txt and applies its reactions to the reference
values of every column (from the 4th on) of IndValues.csv.
'''
import os
from collections import Counter
import numpy as np
import pandas as pd
def ref_to_counter(A):
    '''
    Convert a dataframe in the format of 'reference.txt' into an array of
    Counters, so the atoms of several molecules can be summed with '+'.

    Columns 1-4 of A are read as atom labels and columns 5-8 as the matching
    counts. The counts are expected as strings; every 'x' character in them
    is replaced by '0' before conversion to int. Slots whose count is 0 are
    left out of the Counter. If a label occurs twice in one row, the later
    count wins.

    Parameters
    ----------
    A : pandas.DataFrame
        Frame with at least 9 columns whose columns 5-8 hold strings.
        It is not modified.

    Returns
    -------
    numpy.ndarray
        1-D object array holding one Counter per row of A.
    '''
    # pull the columns out as plain lists so the caller's frame is untouched
    label_cols = [A.iloc[:, col].tolist() for col in range(1, 5)]
    count_cols = [
        A.iloc[:, col].str.replace('x', '0', regex=False).astype(int).tolist()
        for col in range(5, 9)
    ]
    # use counters to represent a molecule, so they can be added easily
    mol_counter = [
        Counter({
            atom: count
            for atom, count in zip(fields[:4], fields[4:])
            if count != 0
        })
        for fields in zip(*label_cols, *count_cols)
    ]
    # explicit object dtype: the result is 1-D even when there are no rows
    return np.array(mol_counter, dtype=object)
# Data sets covered by the autoRE run; their names joined with '-' form the
# suffix of every file name used below.
# Other combinations that were used before:
#   ["PlatonicTAE6","AlkAtom19","TAE140"]
#   ["AlkAtom19","TAE140"]
dataset = ["TAE140"]
folder = 'data'
tag = '-'.join(dataset)
ref_dir = os.path.join(folder, 'autoRE', 'RefValues')

# Reactions written by autoRE. The file is split on single spaces; the
# fields kept here are the reaction label, the four species with the symbols
# between them, and the reference value.
# This is the final frame that will be printed to CSV.
path = os.path.join(ref_dir, 'result_' + tag + '.txt')
names = ['#', 'A', '1+', 'B', '->', 'C', '2+', 'D', 'Ref']
usecols = [0, 1, 3, 4, 6, 7, 9, 10, 13]
ARE = pd.read_csv(path, sep=' ', names=names, usecols=usecols)
ARE['#'] = ARE['#'].str.rstrip(':')

# Indices generated by the autoRE script: the same file, but only the
# parenthesised fields that follow each species, converted to an int array.
names = ['A', 'B', 'C', 'D']
usecols = [2, 5, 8, 11]
AREI = pd.read_csv(path, sep=' ', names=names, usecols=usecols)
AREI = AREI.apply(lambda field: field.str.lstrip('(').str.rstrip(')'))
AREI = AREI.astype('int').to_numpy()
# one index vector per species slot of the reaction A + B -> C + D
idx_a, idx_b, idx_c, idx_d = AREI.T

# Get two pieces of information from the reference text.
path = os.path.join(ref_dir, 'reference_' + tag + '.txt')
usecols = range(0, 9)
A = pd.read_csv(path, sep=' ', usecols=usecols, header=None).astype(str)

# 1) unique atoms - used as the column title of the formula column.
# NOTE(review): the first row is skipped and only the first label of each
# remaining row is collected - presumably every element shows up as a
# leading label somewhere; confirm against the reference file.
label_lists = (
    A.loc[:, [1, 2, 3, 4]]
    .apply(lambda row: ' '.join(row), axis=1)
    .str.rstrip(' x')
    .str.split(' ')
    .tolist()[1:]
)
atoms = '-'.join(sorted({labels[0] for labels in label_lists}))

# 2) atom counts per reaction: Counters of species A and B added together
mol = ref_to_counter(A)
mol_rxn = mol[idx_a] + mol[idx_b]
# convert from counter to string (for saving to csv)
dmol = [
    '-'.join(atom + str(count) for atom, count in rxn.items())
    for rxn in mol_rxn
]

# For every column of IndValues.csv after the third, read the reference.txt
# in the sub-folder of that name and rebuild the reactions from the indices.
file = 'IndValues.csv'
path = os.path.join(folder, file)
T = pd.read_csv(path, nrows=0)
method_cols = T.columns[3:]
ref_values = ARE.Ref.values
RE = np.zeros((AREI.shape[0], T.columns.size - 3))
for ind, col in enumerate(method_cols):
    # take reference data and add them up according to the indices above
    path = os.path.join(folder, 'autoRE', col, 'reference_' + tag + '.txt')
    names = ['TAE']
    usecols = [9]
    R = pd.read_csv(path, sep=' ', names=names, usecols=usecols)
    R = np.squeeze(R.to_numpy())
    dE = R[idx_a] + R[idx_b] - R[idx_c] - R[idx_d]
    # store the deviation from the reference value of the result file
    RE[:, ind] = dE - ref_values

ARE = pd.concat([ARE, pd.DataFrame(RE, columns=method_cols)], axis=1)
# formula column goes at position 8, directly before 'Ref'
ARE.insert(8, atoms, dmol)
path = os.path.join(folder, 'autoRE_' + tag + '.csv')
#ARE.to_csv(path,index=False)
if False:
    # HACK: gross hack because I only wanted this code to run once - the
    # block is kept, but permanently disabled.
    # Adds a formula for the reaction to the Bartlett data set and writes
    # the CSV back over the file it was read from.
    dataset = 'c7cp00757d2.csv'
    path = os.path.join(folder, dataset)
    ARE = pd.read_csv(path)
    # Build a lookup from molecule name (column 0 of the reference frame) to
    # its atom Counter. Built directly, because set_axis(inplace=True) no
    # longer exists in pandas >= 2.0 and concat's names= never named columns.
    molkey = dict(zip(A[0], mol))
    # columns 1 and 3 are the species 'A' and 'B'; an unknown name raises
    # KeyError, as before
    dmol = ARE.iloc[:, [1, 3]].apply(
        lambda species: species.map(lambda name: molkey[name])
    )
    dmol = dmol['A'] + dmol['B']
    dmol = ['-'.join([v + str(k) for v, k in r.items()]) for r in dmol]
    ARE[atoms] = dmol
    ARE.to_csv(path, index=False)