Skip to content

Commit

Permalink
Remove the deprecated `use_dgl` argument and its accompanying logic
Browse files Browse the repository at this point in the history
  • Loading branch information
amorehead committed Nov 1, 2021
1 parent 1df2300 commit f77978b
Show file tree
Hide file tree
Showing 9 changed files with 60 additions and 146 deletions.
14 changes: 7 additions & 7 deletions project/datasets/CASP_CAPRI/casp_capri_dgl_data_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class CASPCAPRIDGLDataModule(LightningDataModule):
casp_capri_test = None

def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int, self_loops: bool,
pn_ratio: float, percent_to_use: float, use_dgl: bool, process_complexes: bool, input_indep: bool):
pn_ratio: float, percent_to_use: float, process_complexes: bool, input_indep: bool):
super().__init__()

self.data_dir = data_dir
Expand All @@ -29,26 +29,26 @@ def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int,
self.self_loops = self_loops
self.pn_ratio = pn_ratio
self.percent_to_use = percent_to_use # Fraction of CASP-CAPRI dataset splits to use
self.use_dgl = use_dgl # Whether to process each complex into a pair of DGL graphs for its final representation
self.process_complexes = process_complexes # Whether to process any unprocessed complexes before training
self.input_indep = input_indep # Whether to use an input-independent pipeline to train the model
self.collate_fn = dgl_picp_collate # Which collation function to use

def setup(self, stage: Optional[str] = None):
# Assign testing dataset for use in DataLoaders - called on every GPU
self.casp_capri_test = CASPCAPRIDGLDataset(mode='test', raw_dir=self.data_dir, knn=self.knn,
self_loops=self.self_loops, pn_ratio=self.pn_ratio,
percent_to_use=self.percent_to_use, use_dgl=self.use_dgl,
geo_nbrhd_size=2, self_loops=self.self_loops, pn_ratio=self.pn_ratio,
percent_to_use=self.percent_to_use,
process_complexes=self.process_complexes,
input_indep=self.input_indep)

def train_dataloader(self) -> DataLoader:
return DataLoader(self.casp_capri_test, batch_size=self.batch_size, shuffle=True,
num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)

def val_dataloader(self) -> DataLoader:
return DataLoader(self.casp_capri_test, batch_size=self.batch_size, shuffle=False,
num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)

def test_dataloader(self) -> DataLoader:
return DataLoader(self.casp_capri_test, batch_size=self.batch_size, shuffle=False,
num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)
29 changes: 3 additions & 26 deletions project/datasets/CASP_CAPRI/casp_capri_dgl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,10 @@ class CASPCAPRIDGLDataset(DGLDataset):
Size of each edge's neighborhood when updating geometric edge features. Default: 2.
self_loops: bool
Whether to connect a given node to itself. Default: True.
pn_ratio: bool
pn_ratio: float
The positive-negative ratio to use when assembling training labels for node-node pairs. Default: 0.1.
percent_to_use: float
How much of the dataset to load. Default: 1.00.
use_dgl: bool
Whether to process each complex into a pair of DGL graphs for its final representation. Default: True.
process_complexes: bool
Whether to process each unprocessed complex as we load in the dataset. Default: True.
input_indep: bool
Expand Down Expand Up @@ -72,7 +70,6 @@ def __init__(self,
self_loops=True,
pn_ratio=0.1,
percent_to_use=1.00,
use_dgl=True,
process_complexes=True,
input_indep=False,
force_reload=False,
Expand All @@ -87,7 +84,6 @@ def __init__(self,
self.self_loops = self_loops
self.pn_ratio = pn_ratio
self.percent_to_use = percent_to_use # How much of the dataset (e.g. CASP-CAPRI training dataset) to use
self.use_dgl = use_dgl # Whether to process each complex into a pair of DGL graphs for its final representation
self.process_complexes = process_complexes # Whether to process any unprocessed complexes before training
self.input_indep = input_indep # Whether to use an input-independent pipeline to train the model
self.final_dir = os.path.join(*self.root.split(os.sep)[:-1])
Expand Down Expand Up @@ -163,9 +159,8 @@ def process(self):
if not os.path.exists(processed_filepath):
processed_parent_dir_to_make = os.path.join(self.processed_dir, os.path.split(raw_path[0])[0])
os.makedirs(processed_parent_dir_to_make, exist_ok=True)
process_complex_into_dict(raw_filepath, processed_filepath,
self.knn, self.geo_nbrhd_size, self.self_loops,
check_sequence=False, use_dgl=self.use_dgl)
process_complex_into_dict(raw_filepath, processed_filepath, self.knn,
self.geo_nbrhd_size, self.self_loops, check_sequence=False)

def has_cache(self):
"""Check if each complex is downloaded and available for testing."""
Expand All @@ -189,28 +184,10 @@ def __getitem__(self, idx):
-------
:class:`dict`
(If process_complexes_into_dicts() was run with use_dgl=True):
Protein complex, DGLGraphs for each of the complex's structures.
- ``complex['graph1']:`` DGLGraph (of length M) containing each of the first graph's encoded node and edge features
- ``complex['graph2']:`` DGLGraph (of length N) containing each of the second graph's encoded node and edge features
- ``complex['examples']:`` PyTorch Tensor (of shape (M x N) x 3) containing the labels for inter-graph node pairs
- ``complex['complex']:`` Python string describing the complex's code and original pdb filename
- ``complex['filepath']:`` Python string describing the complex's filepath
(If process_complexes_into_dicts() was run with use_dgl=False):
Protein complex, feature tensors for each node and edge and indices of each node's neighboring nodes.
- ``complex['graph1_node_feats']:`` PyTorch Tensor containing each of the first graph's encoded node features
- ``complex['graph2_node_feats']``: PyTorch Tensor containing each of the second graph's encoded node features
- ``complex['graph1_node_coords']:`` PyTorch Tensor containing each of the first graph's node coordinates
- ``complex['graph2_node_coords']``: PyTorch Tensor containing each of the second graph's node coordinates
- ``complex['graph1_edge_feats']:`` PyTorch Tensor containing each of the first graph's edge features for each node
- ``complex['graph2_edge_feats']:`` PyTorch Tensor containing each of the second graph's edge features for each node
- ``complex['graph1_nbrhd_indices']:`` PyTorch Tensor containing each of the first graph's neighboring node indices
- ``complex['graph2_nbrhd_indices']:`` PyTorch Tensor containing each of the second graph's neighboring node indices
- ``complex['examples']:`` PyTorch Tensor containing the labels for inter-graph node pairs
- ``complex['complex']:`` Python string describing the complex's code and original pdb filename
- ``complex['filepath']:`` Python string describing the complex's filepath
"""
# Assemble filepath of processed protein complex
Expand Down
18 changes: 8 additions & 10 deletions project/datasets/DIPS/dips_dgl_data_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class DIPSDGLDataModule(LightningDataModule):
dips_test = None

def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int, self_loops: bool,
pn_ratio: float, percent_to_use: float, use_dgl: bool, process_complexes: bool, input_indep: bool):
pn_ratio: float, percent_to_use: float, process_complexes: bool, input_indep: bool):
super().__init__()

self.data_dir = data_dir
Expand All @@ -31,32 +31,30 @@ def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int,
self.self_loops = self_loops
self.pn_ratio = pn_ratio
self.percent_to_use = percent_to_use # Fraction of DIPS dataset splits to use
self.use_dgl = use_dgl # Whether to process each complex into a pair of DGL graphs for its final representation
self.process_complexes = process_complexes # Whether to process any unprocessed complexes before training
self.input_indep = input_indep # Whether to use an input-independent pipeline to train the model
self.collate_fn = dgl_picp_collate # Which collation function to use

def setup(self, stage: Optional[str] = None):
# Assign training/validation/testing data set for use in DataLoaders - called on every GPU
self.dips_train = DIPSDGLDataset(mode='train', raw_dir=self.data_dir, knn=self.knn, self_loops=self.self_loops,
pn_ratio=self.pn_ratio, percent_to_use=self.percent_to_use,
use_dgl=self.use_dgl, process_complexes=self.process_complexes,
input_indep=self.input_indep)
process_complexes=self.process_complexes, input_indep=self.input_indep)
self.dips_val = DIPSDGLDataset(mode='val', raw_dir=self.data_dir, knn=self.knn, self_loops=self.self_loops,
pn_ratio=self.pn_ratio, percent_to_use=self.percent_to_use, use_dgl=self.use_dgl,
pn_ratio=self.pn_ratio, percent_to_use=self.percent_to_use,
process_complexes=self.process_complexes, input_indep=self.input_indep)
self.dips_test = DIPSDGLDataset(mode='test', raw_dir=self.data_dir, knn=self.knn, self_loops=self.self_loops,
pn_ratio=self.pn_ratio, percent_to_use=self.percent_to_use,
use_dgl=self.use_dgl, process_complexes=self.process_complexes,
input_indep=self.input_indep)
process_complexes=self.process_complexes, input_indep=self.input_indep)

def train_dataloader(self) -> DataLoader:
return DataLoader(self.dips_train, batch_size=self.batch_size, shuffle=True,
num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)

def val_dataloader(self) -> DataLoader:
return DataLoader(self.dips_val, batch_size=self.batch_size, shuffle=False,
num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)

def test_dataloader(self) -> DataLoader:
return DataLoader(self.dips_test, batch_size=self.batch_size, shuffle=False,
num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)
27 changes: 2 additions & 25 deletions project/datasets/DIPS/dips_dgl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ class DIPSDGLDataset(DGLDataset):
The positive-negative ratio to use when assembling training labels for node-node pairs. Default: 0.1.
percent_to_use: float
How much of the dataset to load. Default: 1.00.
use_dgl: bool
Whether to process each complex into a pair of DGL graphs for its final representation. Default: True.
process_complexes: bool
Whether to process each unprocessed complex as we load in the dataset. Default: True.
input_indep: bool
Expand Down Expand Up @@ -83,7 +81,6 @@ def __init__(self,
self_loops=True,
pn_ratio=0.1,
percent_to_use=1.00,
use_dgl=True,
process_complexes=True,
input_indep=False,
train_viz=False,
Expand All @@ -99,7 +96,6 @@ def __init__(self,
self.self_loops = self_loops
self.pn_ratio = pn_ratio
self.percent_to_use = percent_to_use # How much of the requested dataset (e.g. DIPS-Plus) to use
self.use_dgl = use_dgl # Whether to process each complex into a pair of DGL graphs for its final representation
self.process_complexes = process_complexes # Whether to process any unprocessed complexes before training
self.input_indep = input_indep # Whether to use an input-independent pipeline to train the model
self.train_viz = train_viz # Whether to curate the training loop's validation samples for visualization
Expand Down Expand Up @@ -183,9 +179,8 @@ def process(self):
if not os.path.exists(processed_filepath):
processed_parent_dir_to_make = os.path.join(self.processed_dir, os.path.split(raw_path[0])[0])
os.makedirs(processed_parent_dir_to_make, exist_ok=True)
process_complex_into_dict(raw_filepath, processed_filepath,
self.knn, self.geo_nbrhd_size, self.self_loops,
check_sequence=False, use_dgl=self.use_dgl)
process_complex_into_dict(raw_filepath, processed_filepath, self.knn,
self.geo_nbrhd_size, self.self_loops, check_sequence=False)

def has_cache(self):
"""Check if each complex is downloaded and available for training, validation, or testing."""
Expand All @@ -209,28 +204,10 @@ def __getitem__(self, idx):
-------
:class:`dict`
(If process_complexes_into_dicts() was run with use_dgl=True):
Protein complex, DGLGraphs for each of the complex's structures.
- ``complex['graph1']:`` DGLGraph (of length M) containing each of the first graph's encoded node and edge features
- ``complex['graph2']:`` DGLGraph (of length N) containing each of the second graph's encoded node and edge features
- ``complex['examples']:`` PyTorch Tensor (of shape (M x N) x 3) containing the labels for inter-graph node pairs
- ``complex['complex']:`` Python string describing the complex's code and original pdb filename
- ``complex['filepath']:`` Python string describing the complex's filepath
(If process_complexes_into_dicts() was run with use_dgl=False):
Protein complex, feature tensors for each node and edge and indices of each node's neighboring nodes.
- ``complex['graph1_node_feats']:`` PyTorch Tensor containing each of the first graph's encoded node features
- ``complex['graph2_node_feats']``: PyTorch Tensor containing each of the second graph's encoded node features
- ``complex['graph1_node_coords']:`` PyTorch Tensor containing each of the first graph's node coordinates
- ``complex['graph2_node_coords']``: PyTorch Tensor containing each of the second graph's node coordinates
- ``complex['graph1_edge_feats']:`` PyTorch Tensor containing each of the first graph's edge features for each node
- ``complex['graph2_edge_feats']:`` PyTorch Tensor containing each of the second graph's edge features for each node
- ``complex['graph1_nbrhd_indices']:`` PyTorch Tensor containing each of the first graph's neighboring node indices
- ``complex['graph2_nbrhd_indices']:`` PyTorch Tensor containing each of the second graph's neighboring node indices
- ``complex['examples']:`` PyTorch Tensor containing the labels for inter-graph node pairs
- ``complex['complex']:`` Python string describing the complex's code and original pdb filename
- ``complex['filepath']:`` Python string describing the complex's filepath
"""
# Assemble filepath of processed protein complex
Expand Down
Loading

0 comments on commit f77978b

Please sign in to comment.