-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_blobs.py
128 lines (95 loc) · 3.34 KB
/
create_blobs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
Process a single `.smiles` file and, for every million molecules, dump a blob
storing their precalculated fingerprints. The fingerprint is the RDKit
fingerprint (Daylight-like).
python create_blobs.py -n 1 --outpath blobs --input_folder data
"""
import os
import pickle
import argparse
import profile
import fnmatch
from datetime import datetime
import rdkit
from rdkit import Chem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from tqdm import tqdm
import utils
from utils import add_arguments
from utils import process_args
from utils import format_duration
from utils import BlobsIO
# Row count handed to tqdm as `total`; used only for the progress bar.
N_PER_FILE = 60_262_532

# Command-line interface: options registered by utils.add_arguments plus the
# two script-specific options below.
parser = argparse.ArgumentParser()
add_arguments(parser)
parser.add_argument(
    "--outpath",
    default='./blobs',
    type=str,
    help="Path to save blobs of fingerprints.",
)
parser.add_argument(
    '-n',
    '--number',
    default=2,
    type=int,
    help='Select part of enamine real database. 1-12',
)
args = parser.parse_args()

# Wall-clock start; the total duration is reported at the end of the script.
starttime = datetime.now()

# Fingerprint call variants noted by the original author:
#   AllChem.GetMorganFingerprint(reference, 2)
#   FingerprintMols.FingerprintMol(reference)
#   FingerprintMols.GetRDKFingerprint
def process_row(row: str):
    """
    Parse one line of a `.smiles` file and fingerprint its molecule.

    Inputs:
        row: str
            Tab-separated line. The first two entries are the
            SMILES and the ID of the row; further entries are ignored.
    Return:
        tuple or None:
            `(idx, fp, smiles)` holding the ID, the fingerprint and the
            SMILES string. None if the row could not be processed; the
            offending row is printed in that case.
    """
    try:
        smiles, idx, *_ = row.split("\t")
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparseable SMILES by returning None.
            print("Failed to read row:", row)
            return None
        fp = Fingerprint(mol)  # this takes the bulk of time
    except Exception:
        # Best effort: report the bad row and keep going. Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate, so a
        # long-running job can still be aborted.
        print("Failed to read row:", row)
        return None
    return (idx, fp, smiles)
# RDKit fingerprint function applied to every molecule by process_row.
Fingerprint = FingerprintMols.FingerprintMol

# Number of successfully fingerprinted molecules stored in one blob.
MOLECULES_PER_BLOB = 1_000_000

_cwd, inpath, outpath, inputs = process_args(args)

# Validate explicitly: an `assert` would be stripped when running with -O.
if not 1 <= args.number <= 12:
    parser.error("Select a number from 1 to 12 for flag --number.")

# Select the first input whose name contains the zero-padded part number.
# Abort when nothing matches instead of silently falling through to an
# unrelated file (or hitting a NameError when `inputs` is empty).
wanted = f'{args.number:02}.smiles'
filename = next((name for name in inputs if wanted in name), None)
if filename is None:
    raise FileNotFoundError(f"No input file containing {wanted!r} was found.")
print("Selected file:", filename)

part_id = utils.find_int(filename)

cache = []  # (idx, fingerprint, smiles) tuples waiting for the next dump
blobs = []  # return values of write_blob (presumably blob file names)
with open(filename) as f:
    blob_io = BlobsIO(part=part_id, path=outpath,
                      overwrite=bool(args.force))
    for row in tqdm(f, desc=f'Part {part_id:02}',
                    mininterval=20,
                    maxinterval=60,
                    ascii=True, total=N_PER_FILE):
        t_result = process_row(row)
        if t_result is None:
            # Rows that failed to parse are skipped and do not count
            # towards the blob size.
            continue
        cache.append(t_result)
        if len(cache) >= MOLECULES_PER_BLOB:
            blobs.append(blob_io.write_blob(cache=cache))
            cache = []
    # Flush the remainder, but do not write an empty trailing blob.
    if cache:
        blobs.append(blob_io.write_blob(cache=cache))

endtime = datetime.now()
print(format_duration(starttime, endtime))
# fname =
# gen = read_blob(fname)
# for r in gen: print(r)
# ref = 'COC1=C(OCCCN2CCOCC2)C=C2C(NC3=CC(Cl)=C(F)C=C3)=NC=NC2=C1'
# ref = Chem.MolFromSmiles(ref)
# fp = Fingerprint(ref)
# fp_explicit = rdkit.DataStructs.cDataStructs.ExplicitBitVect