# autore_perloutput.py

'''
A script to work up the results of an auto RE script

July 28 2022 11:11 am Bryan lau

takes the autoRE result.txt and applies it to all other functions
'''

import os
from collections import Counter

import numpy as np
import pandas as pd

def ref_to_counter(A):
    '''
    Convert a frame laid out like 'reference.txt' into an array of Counters,
    for counting total atoms in a reaction.

    Columns 1-4 (by position) are read as element symbols and columns 5-8 as
    the matching atom counts; an 'x' in a count column is treated as zero.
    Each row becomes one Counter mapping symbol -> count. Slots with a zero
    count are left out, and if a symbol repeats within a row the later slot
    wins.

    Parameters
    ----------
    A : pandas.DataFrame
        Frame with at least nine columns. Count columns may hold strings
        (e.g. '2', 'x') or integers.

    Returns
    -------
    numpy.ndarray
        1-D object array with one Counter per row of A, so that it can be
        fancy-indexed and added elementwise.

    The input frame is left unmodified, so the function can be called more
    than once on the same frame.
    '''
    symbols = A.iloc[:, 1:5].to_numpy()
    # convert the counts on copies rather than writing them back into A;
    # astype(str) first so already-numeric count columns are accepted too
    counts = np.column_stack([
        A.iloc[:, ix].astype(str).str.replace('x', '0', regex=False).astype(int).to_numpy()
        for ix in range(5, 9)
    ])

    # use counters to represent a molecule, so they can be added easily;
    # fill an object array explicitly so numpy never tries to unpack them
    mol_counter = np.empty(len(symbols), dtype=object)
    for irow, (row_symbols, row_counts) in enumerate(zip(symbols, counts)):
        mol_dict = {
            sym: int(num)
            for sym, num in zip(row_symbols, row_counts)
            if num != 0
        }
        mol_counter[irow] = Counter(mol_dict)

    return mol_counter

# data sets whose autoRE output is processed; selections used previously were
# ["PlatonicTAE6","AlkAtom19","TAE140"] and ["AlkAtom19","TAE140"]
dataset = ["TAE140"]

folder = 'data'
# every autoRE file name ends with the data set names joined by '-'
tag = '-'.join(dataset)
ref_dir = os.path.join(folder, 'autoRE', 'RefValues')

# final frame that will be printed to CSV: selected columns of the autoRE
# result file, with the trailing ':' removed from the '#' column
path = os.path.join(ref_dir, 'result_' + tag + '.txt')
ARE = pd.read_csv(
    path,
    sep=' ',
    names=['#', 'A', '1+', 'B', '->', 'C', '2+', 'D', 'Ref'],
    usecols=[0, 1, 3, 4, 6, 7, 9, 10, 13],
)
ARE['#'] = ARE['#'].str.rstrip(':')

# indices generated by autoRE script: columns 2, 5, 8 and 11 of the same
# file, with the surrounding parentheses removed, as an integer array
AREI = pd.read_csv(
    path,
    sep=' ',
    names=['A', 'B', 'C', 'D'],
    usecols=[2, 5, 8, 11],
)
AREI = AREI.apply(lambda field: field.str.lstrip('(').str.rstrip(')'))
AREI = AREI.astype('int').to_numpy()
idx_a, idx_b, idx_c, idx_d = AREI.T

# get two pieces of information from the reference text
path = os.path.join(ref_dir, 'reference_' + tag + '.txt')
A = pd.read_csv(path, sep=' ', usecols=range(0, 9), header=None).astype(str)

# unique atoms - use as column title
# NOTE(review): row 0 is skipped and only the first symbol of every other
# row is collected -- confirm this covers all elements in the data set
row_symbols = A.loc[:, [1, 2, 3, 4]].apply(lambda row: ' '.join(row), axis=1)
row_symbols = row_symbols.str.rstrip(' x')
first_symbols = {parts[0] for parts in row_symbols.str.split(' ').tolist()[1:]}
atoms = '-'.join(sorted(first_symbols))

# count number of unique atoms per reaction (species A and B added together)
mol = ref_to_counter(A)
mol_rxn = mol[idx_a] + mol[idx_b]
# convert from counter to string (saving to csv)
dmol = []
for rxn in mol_rxn:
    dmol.append('-'.join(sym + str(num) for sym, num in rxn.items()))

# for each reference.txt, rebuild the reactions with indices in R;
# only the header of IndValues.csv is read, and its columns from the
# fourth onwards name the sub-folders to process
T = pd.read_csv(os.path.join(folder, 'IndValues.csv'), nrows=0)
methods = T.columns[3:]
ref_values = ARE.Ref.values

RE = np.zeros((AREI.shape[0], T.columns.size - 3))
for ind, col in enumerate(methods):
    # take reference data and add them up according to the indices above
    path = os.path.join(folder, 'autoRE', col, 'reference_' + tag + '.txt')
    R = pd.read_csv(path, sep=' ', names=['TAE'], usecols=[9])
    R = np.squeeze(R.to_numpy())
    dE = R[idx_a] + R[idx_b] - R[idx_c] - R[idx_d]
    # store the difference from the 'Ref' column of the result file
    RE[:, ind] = dE - ref_values

ARE = pd.concat([ARE, pd.DataFrame(RE, columns=methods)], axis=1)
ARE.insert(8, atoms, dmol)
path = os.path.join(folder, 'autoRE_' + tag + '.csv')
# writing the combined table is currently switched off
#ARE.to_csv(path,index=False)

if 1 == 0:
    # HACK: one-off step, kept switched off by the always-false guard because
    # it was only meant to run once.
    # also add a formula for the reaction for the Bartlett data set
    dataset = 'c7cp00757d2.csv'
    path = os.path.join(folder,dataset)
    ARE = pd.read_csv(path)
    # map each name in column 0 of the reference frame to its Counter;
    # a plain dict avoids pandas calls that newer releases removed
    # (set_axis(inplace=True)) or deprecated (applymap)
    molkey = dict(zip(A[0], mol))
    # add up the Counters of the species in columns 1 and 3 of each row;
    # a name that is missing from the reference raises KeyError
    dmol = [molkey[a] + molkey[b] for a, b in zip(ARE.iloc[:,1], ARE.iloc[:,3])]
    # convert from counter to string (saving to csv)
    dmol = ['-'.join([v+str(k) for v,k in r.items()]) for r in dmol]
    ARE[atoms] = dmol
    # overwrites the input file in place
    ARE.to_csv(path,index=False)