Fix code smells
Signed-off-by: Giorgos Paraskevopoulos <[email protected]>
georgepar committed Mar 10, 2021
1 parent 3310e53 commit 3a54132
Showing 13 changed files with 52 additions and 58 deletions.
2 changes: 1 addition & 1 deletion slp/config/nlp.py
@@ -32,7 +32,7 @@ def has_token(cls, token):
Returns:
bool: True if token exists, False if not
"""
return any(token == t.name or token == t.value for t in cls)
return any(token in {t.name, t.value} for t in cls)

@classmethod
def to_list(cls):
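The refactor above replaces a chained equality check with a single set-membership test. For illustration, a minimal sketch with a hypothetical `Token` enum standing in for `SPECIAL_TOKENS`:

```python
from enum import Enum


class Token(Enum):
    # Hypothetical stand-in for slp.config.nlp.SPECIAL_TOKENS
    PAD = "[PAD]"
    UNK = "[UNK]"

    @classmethod
    def has_token(cls, token):
        # One membership test instead of `token == t.name or token == t.value`
        return any(token in {t.name, t.value} for t in cls)


print(Token.has_token("PAD"))    # True: matches a member name
print(Token.has_token("[UNK]"))  # True: matches a member value
print(Token.has_token("foo"))    # False
```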
20 changes: 8 additions & 12 deletions slp/data/corpus.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast

import numpy as np
from loguru import logger
from tqdm import tqdm

import slp.util.system as system
import slp.util.types as types
from loguru import logger
from slp.config.nlp import SPECIAL_TOKENS
from slp.data.transforms import HuggingFaceTokenizer, SpacyTokenizer, ToTokenIds
from tqdm import tqdm


def create_vocab(
@@ -119,10 +118,7 @@ def in_accepted_vocab(self, word: str) -> bool:
bool: Word exists
"""

if self.vocab is None:
return True
else:
return word in self.vocab
return True if self.vocab is None else word in self.vocab

def _get_cache_name(self) -> str:
"""Create a cache file name to avoid reloading the embeddings
@@ -134,7 +130,7 @@ def _get_cache_name(self) -> str:
str: Cache file name
"""
head, tail = os.path.split(self.embeddings_file)
filename, ext = os.path.splitext(tail)
filename, _ = os.path.splitext(tail)

if self.vocab is not None:
cache_name = os.path.join(head, f"{filename}.{len(self.vocab)}.p")
@@ -527,7 +523,6 @@ def __getitem__(self, idx) -> List[int]:
Returns:
List[int]: List of token indices for sentence
"""
indices = self.corpus_indices_[idx]
out: List[int] = (
self.corpus_indices_[idx]
if self.max_len <= 0
@@ -682,7 +677,7 @@ def __len__(self) -> int:

return len(self.corpus_indices_)

def __getitem__(self, idx):
def __getitem__(self, idx) -> List[int]:
"""Get ith element in corpus as token indices
Args:
@@ -691,13 +686,14 @@ def __getitem__(self, idx):
Returns:
List[int]: List of token indices for sentence
"""

return (
out: List[int] = (
self.corpus_indices_[idx]
if self.max_len <= 0
else self.corpus_indices_[idx][: self.max_len]
)

return out


class TokenizedCorpus(object):
def __init__(
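The `__getitem__` changes in this file drop an unused local and annotate the return type while keeping the same truncation behaviour. A minimal sketch with a hypothetical `TinyCorpus` (the real classes build `corpus_indices_` from tokenized text):

```python
from typing import List


class TinyCorpus:
    # Hypothetical stand-in for the corpus classes in slp/data/corpus.py
    def __init__(self, corpus_indices: List[List[int]], max_len: int = 0):
        self.corpus_indices_ = corpus_indices
        self.max_len = max_len

    def __getitem__(self, idx) -> List[int]:
        # max_len <= 0 means "no truncation"; otherwise clip the sentence
        out: List[int] = (
            self.corpus_indices_[idx]
            if self.max_len <= 0
            else self.corpus_indices_[idx][: self.max_len]
        )

        return out


corpus = TinyCorpus([[1, 2, 3, 4, 5]], max_len=3)
print(corpus[0])  # [1, 2, 3]
```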
16 changes: 8 additions & 8 deletions slp/modules/attention.py
@@ -14,9 +14,9 @@ def attention_scores(
dropout: float = 0.2,
training: bool = True,
) -> torch.Tensor:
"""Calculate attention scores for scaled dot product attention
r"""Calculate attention scores for scaled dot product attention
$$s = softmax(\\frac{Q \cdot K^T}{\sqrt{d}})$$
$$s = softmax(\frac{Q \cdot K^T}{\sqrt{d}})$$
* B: Batch size
* L: Keys Sequence length
@@ -80,11 +80,11 @@ def forward(
values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Single-head scaled dot-product attention forward pass
r"""Single-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
$$a = softmax(\\frac{Q}{K^T}){\sqrt{d}}) \dot V$$
$$a = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) \cdot V$$
* B: Batch size
* L: Keys Sequence length
@@ -127,7 +127,7 @@ def forward(
return out, scores

def _reset_parameters(self):
"""xavier uniform init for Linear layer weights"""
"""Xavier uniform init for Linear layer weights"""
nn.init.xavier_uniform_(self.k.weight)
nn.init.xavier_uniform_(self.q.weight)
nn.init.xavier_uniform_(self.v.weight)
@@ -200,13 +200,13 @@ def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
return x.view(batch_size, max_length, -1)

def forward(self, keys, queries=None, values=None, attention_mask=None):
"""Multi-head scaled dot-product attention forward pass
r"""Multi-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
Each head performs dot-product attention
$$a_H = softmax(\\frac{Q_H \cdot K_H^T}{\sqrt{d}}) \cdot V_H$$
$$a_H = softmax(\frac{Q_H \cdot K_H^T}{\sqrt{d}}) \cdot V_H$$
The outputs of multiple heads are concatenated and passed through a feedforward layer.
@@ -255,7 +255,7 @@ def forward(self, keys, queries=None, values=None, attention_mask=None):
return out

def _reset_parameters(self):
"""xavier uniform init for Linear layer weights"""
"""Xavier uniform init for Linear layer weights"""
nn.init.xavier_uniform_(self.k.weight)
nn.init.xavier_uniform_(self.q.weight)
nn.init.xavier_uniform_(self.v.weight)
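The docstrings above describe scaled dot-product attention, s = softmax(QK^T / sqrt(d)), applied to the values. A small self-contained sketch of that formula, as an illustration rather than the module's actual code path:

```python
import math

import torch
import torch.nn.functional as F


def toy_attention(queries: torch.Tensor, keys: torch.Tensor, values: torch.Tensor):
    # queries: (B, M, d), keys/values: (B, L, d)
    d = queries.size(-1)
    scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(d)  # (B, M, L)
    scores = F.softmax(scores, dim=-1)
    out = torch.bmm(scores, values)  # (B, M, d)

    return out, scores


q = torch.rand(2, 4, 8)
k = torch.rand(2, 6, 8)
v = torch.rand(2, 6, 8)
out, scores = toy_attention(q, k, v)
print(out.shape, scores.shape)  # torch.Size([2, 4, 8]) torch.Size([2, 4, 6])
```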
6 changes: 3 additions & 3 deletions slp/modules/embed.py
@@ -11,19 +11,19 @@

class PositionalEncoding(nn.Module):
def __init__(self, embedding_dim: int = 512, max_len: int = 5000):
"""Inject some information about the relative or absolute position of the tokens in the sequence.
r"""Inject some information about the relative or absolute position of the tokens in the sequence.
The positional encodings have the same dimension as
the embeddings, so that the two can be summed. Here, we use sine and cosine
functions of different frequencies.
PE for even positions:
$$\\text{PosEncoder}(pos, 2i) = sin(\\frac{pos}{10000^{\\frac{2i}{d}}})$$
$$\text{PosEncoder}(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{d}}})$$
PE for odd positions:
$$\\text{PosEncoder}(pos, 2i+1) = cos(\\frac{pos}{10000^{\\frac{2i}{d}}})$$
$$\text{PosEncoder}(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{d}}})$$
where $pos$ is the word position and $i$ is the embedding idx
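The positional-encoding docstring uses sine for even embedding indices and cosine for odd ones. A sketch of one common way to compute such a table (returning a plain tensor; the real module presumably registers it as a buffer and adds it to the embeddings):

```python
import math

import torch


def sinusoidal_positional_encoding(max_len: int, embedding_dim: int) -> torch.Tensor:
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
    div_term = torch.exp(
        torch.arange(0, embedding_dim, 2, dtype=torch.float)
        * (-math.log(10000.0) / embedding_dim)
    )  # (embedding_dim / 2,)
    pe = torch.zeros(max_len, embedding_dim)
    pe[:, 0::2] = torch.sin(position * div_term)  # even positions
    pe[:, 1::2] = torch.cos(position * div_term)  # odd positions

    return pe


pe = sinusoidal_positional_encoding(max_len=50, embedding_dim=512)
print(pe.shape)  # torch.Size([50, 512])
```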
2 changes: 1 addition & 1 deletion slp/modules/feedforward.py
@@ -84,7 +84,7 @@ def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
self.net = nn.Sequential(self.ff1, self.drop, self.ff2)

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Position-wise FF forward pass
r"""Position-wise FF forward pass
$$out = W_2 \cdot max(0, W_1 \cdot x + b_1) + b_2$$
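The formula in this docstring is the standard position-wise feed-forward block, out = W2 · max(0, W1·x + b1) + b2. An illustrative sketch, assuming ReLU for the max(0, ·) term and arbitrary layer sizes:

```python
import torch
import torch.nn as nn


class ToyPositionwiseFF(nn.Module):
    # Illustrative two-layer feed-forward block applied to every position independently
    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),   # W_1 x + b_1
            nn.ReLU(),                  # max(0, .)
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),   # W_2 (.) + b_2
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


ff = ToyPositionwiseFF(d_model=512, d_ff=2048)
print(ff(torch.rand(2, 10, 512)).shape)  # torch.Size([2, 10, 512])
```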
5 changes: 1 addition & 4 deletions slp/plbind/dm.py
@@ -499,15 +499,12 @@ def _zip_corpus_and_labels(

if self.language_model:
train_labels = train
train = train

if val is not None:
val_labels = val
val = val

if test is not None:
test_labels = test
test = test

train_data = (
list(zip(train, train_labels)) if train_labels is not None else train
@@ -554,7 +551,7 @@ def _select_corpus_cls(self, corpus_args):
return corpus_cls, corpus_args

def _force_train_vocab_on_val_and_test(self, corpus_args, train_corpus):
if self.tokenizer == "spacy" or self.tokenizer == "tokenized":
if self.tokenizer in {"spacy", "tokenized"}:
# Force train vocabulary on val & test
corpus_args["word2idx"] = train_corpus.word2idx

25 changes: 14 additions & 11 deletions slp/plbind/module.py
@@ -7,13 +7,12 @@
import torch.nn as nn
from loguru import logger
from omegaconf import DictConfig
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler

from slp.config.omegaconf import OmegaConf
from slp.util.pytorch import pad_mask, subsequent_mask
from slp.util.system import print_separator
from slp.util.types import Configuration, LossType
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler


class _Predictor(ABC):
@@ -35,7 +34,7 @@ def parse_batch(self, batch: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...
Returns:
Tuple[torch.Tensor, ...]: The processed inputs, ready to provide to the model
"""
pass
raise NotImplementedError

@abstractmethod
def get_predictions_and_targets(
@@ -54,7 +53,7 @@ def get_predictions_and_targets(
Returns:
Tuple[torch.Tensor, torch.Tensor]: (logits, ground_truths), ready to be passed to the loss function
"""
pass
raise NotImplementedError


class _Classification(_Predictor):
@@ -271,7 +270,7 @@ def get_predictions_and_targets(


class _BertSequenceClassification(_Predictor):
""" Bert Classification task"""
"""Bert Classification task"""

def parse_batch(self, batch: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...]:
"""Parse incoming batch
@@ -331,7 +330,7 @@ def __init__(
predictor_cls=_Classification,
calculate_perplexity: bool = False, # for LM. Dirty but much more efficient
):
"""LightningModule wrapper for a (model, optimizer, criterion, lr_scheduler) tuple
"""Wraps a (model, optimizer, criterion, lr_scheduler) tuple in a LightningModule
Handles the boilerplate for metrics calculation and logging and defines the train_step / val_step / test_step
with use of the predictor helper classes (e.g. _Classification, _RnnClassification)
@@ -383,13 +382,15 @@ def configure_optimizers(self):
Returns:
Tuple[List[Optimizer], List[_LRScheduler]]: (optimizers, lr_schedulers)
"""

if self.lr_scheduler is not None:
return self.optimizer, self.lr_scheduler
else:
return self.optimizer

return self.optimizer

def forward(self, *args, **kwargs):
""" Call wrapped module forward"""
"""Call wrapped module forward"""

return self.model(*args, **kwargs)

def _compute_metrics(self, metrics, loss, y_hat, targets, mode="train"):
@@ -405,6 +406,7 @@ def _compute_metrics(self, metrics, loss, y_hat, targets, mode="train"):

def fmt(name):
"""Format metric name"""

return f"{mode}_{name}"

metrics = {f"{mode}_{k}": v(y_hat, targets) for k, v in metrics.items()}
@@ -446,6 +448,7 @@ def aggregate_epoch_metrics(self, outputs, mode="Training"):

def fmt(name):
"""Format metric name"""

return f"{name}" if name != "loss" else "train_loss"

keys = list(outputs[0].keys())
@@ -473,7 +476,7 @@ def training_step(self, batch, batch_idx):
)

self.log_dict(
{k: v for k, v in metrics.items()},
metrics,
on_step=True,
on_epoch=False,
logger=True,
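The abstract methods above now raise NotImplementedError instead of silently passing, so a subclass that forgets to override them fails loudly. A compact sketch of that pattern with a hypothetical Predictor interface and a toy classification implementation:

```python
from abc import ABC, abstractmethod
from typing import Tuple

import torch


class Predictor(ABC):
    # Hypothetical mirror of the _Predictor interface
    @abstractmethod
    def parse_batch(self, batch: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...]:
        # Raising keeps misuse loud if a subclass forgets to override
        raise NotImplementedError

    @abstractmethod
    def get_predictions_and_targets(
        self, model: torch.nn.Module, batch: Tuple[torch.Tensor, ...]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        raise NotImplementedError


class SimpleClassification(Predictor):
    def parse_batch(self, batch):
        inputs, targets = batch

        return inputs, targets

    def get_predictions_and_targets(self, model, batch):
        inputs, targets = self.parse_batch(batch)

        return model(inputs), targets


model = torch.nn.Linear(4, 2)
logits, targets = SimpleClassification().get_predictions_and_targets(
    model, (torch.rand(8, 4), torch.randint(0, 2, (8,)))
)
print(logits.shape)  # torch.Size([8, 2])
```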
6 changes: 3 additions & 3 deletions slp/plbind/trainer.py
@@ -444,15 +444,15 @@ def make_trainer(
),
]

logger.info(f"Configured wandb and CSV loggers.")
logger.info("Configured wandb and CSV loggers.")
logger.info(
f"Wandb configured to run {experiment_name}/{run_id} in project {wandb_project}"
)

if connected:
logger.info(f"Results will be stored online.")
logger.info("Results will be stored online.")
else:
logger.info(f"Results will be stored offline due to bad internet connection.")
logger.info("Results will be stored offline due to bad internet connection.")
logger.info(
f"If you want to upload your results later run\n\t wandb sync {logging_dir}/wandb/run-{run_id}"
)
3 changes: 1 addition & 2 deletions slp/util/log.py
@@ -1,4 +1,3 @@
import io
import logging
from typing import Any, Optional

@@ -72,7 +71,7 @@ def emit(self, record):
logger.remove()

def tqdm_write(msg: str) -> Any:
"""tqdm write wrapper for loguru"""
"""Loguru wrapper for tqdm.write"""
return tqdm.write(msg, end="")

logger.add(tqdm_write, colorize=True)
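The snippet above replaces loguru's default sink with tqdm.write so log messages do not corrupt active progress bars. A hedged usage sketch of the same idea:

```python
from loguru import logger
from tqdm import tqdm


def tqdm_write(msg: str):
    # Hand every formatted log record to tqdm so progress bars stay intact
    return tqdm.write(msg, end="")


logger.remove()
logger.add(tqdm_write, colorize=True)

for i in tqdm(range(3)):
    logger.info(f"processing item {i}")
```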
10 changes: 5 additions & 5 deletions slp/util/pytorch.py
@@ -130,9 +130,9 @@ def sort_sequences(
lengths_sorted, sorted_idx = lengths.sort(descending=True)
_, unsorted_idx = sorted_idx.sort()

def unsort(t: torch.Tensor) -> torch.Tensor:
def unsort(tt: torch.Tensor) -> torch.Tensor:
"""Restore original unsorted sequence"""
return t[unsorted_idx]
return tt[unsorted_idx]

return inputs[sorted_idx], lengths_sorted, unsort

@@ -217,7 +217,7 @@ def mktensor(
dtype: torch.dtype = torch.float,
device: types.Device = "cpu",
requires_grad: bool = False,
copy: bool = True,
copy_tensor: bool = True,
) -> torch.Tensor:
"""Convert a list or numpy array to torch tensor. If a torch tensor
is passed it is cast to dtype, device and the requires_grad flag is
@@ -231,15 +231,15 @@
device: (torch.device, str): Device where the tensor should be
(Default value = 'cpu')
requires_grad: (bool): Trainable tensor or not? (Default value = False)
copy: (bool): If false creates the tensor inplace else makes a copy
copy_tensor: (bool): If false creates the tensor inplace else makes a copy
(Default value = True)
Returns:
(torch.Tensor): A tensor of appropriate dtype, device and
requires_grad containing data
"""
tensor_factory = t if copy else t_
tensor_factory = t if copy_tensor else t_
return tensor_factory(data, dtype=dtype, device=device, requires_grad=requires_grad)


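sort_sequences above returns an unsort closure alongside the sorted inputs, so callers can restore the original order after packing or processing. A stand-alone sketch of the pattern (re-implemented here for illustration, not the library call itself):

```python
import torch


def sort_sequences(inputs: torch.Tensor, lengths: torch.Tensor):
    lengths_sorted, sorted_idx = lengths.sort(descending=True)
    _, unsorted_idx = sorted_idx.sort()

    def unsort(tt: torch.Tensor) -> torch.Tensor:
        # Index with the inverse permutation to restore the original order
        return tt[unsorted_idx]

    return inputs[sorted_idx], lengths_sorted, unsort


x = torch.tensor([[1, 0, 0], [2, 2, 2], [3, 3, 0]])
lengths = torch.tensor([1, 3, 2])
x_sorted, lengths_sorted, unsort = sort_sequences(x, lengths)
print(lengths_sorted.tolist())    # [3, 2, 1]
print(unsort(x_sorted).tolist())  # back to the original row order
```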
4 changes: 2 additions & 2 deletions slp/util/system.py
@@ -143,7 +143,7 @@ def is_subpath(child: str, parent: str) -> bool:


def safe_mkdirs(path: str) -> None:
"""safe_mkdirs Makes recursively all the directories in input path
"""Makes recursively all the directories in input path
Utility function similar to mkdir -p. Makes directories recursively, if given path does not exist
@@ -162,7 +162,7 @@ def timethis(method=False) -> Callable:


def timethis(method=False) -> Callable:
"""timethis Decorator to measure the time it takes for a function to complete
"""Decorator to measure the time it takes for a function to complete
Examples:
>>> @slp.util.system.timethis
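timethis is used as a decorator, per the doctest above. A simplified, hypothetical analogue that just prints the elapsed time (the real implementation also takes a method flag and logs via loguru):

```python
import time
from functools import wraps


def timeit(fn):
    # Simplified, hypothetical analogue of slp.util.system.timethis
    @wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = fn(*args, **kwargs)
        print(f"{fn.__name__} took {time.time() - start:.4f}s")

        return result

    return wrapper


@timeit
def slow_add(a, b):
    time.sleep(0.1)

    return a + b


print(slow_add(1, 2))
```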
(The remaining 2 changed files were not loaded in this view.)
