From 96eb13870a80691b93329b31d8d6c986e7ed4435 Mon Sep 17 00:00:00 2001 From: Patrick_Cin Date: Tue, 23 Feb 2021 13:29:56 +0700 Subject: [PATCH] Edit readme (#26) * refactor pkg name * update README * update year in readme * remove ruby folder --- README.md | 18 ++-- ruby/__init__.py | 4 - ruby/translator.py | 83 ------------------- ruby/utils/__init__.py | 1 - ruby/utils/misc.py | 4 - ruby/utils/tree_manipulation.py | 141 -------------------------------- 6 files changed, 9 insertions(+), 242 deletions(-) delete mode 100644 ruby/__init__.py delete mode 100644 ruby/translator.py delete mode 100644 ruby/utils/__init__.py delete mode 100644 ruby/utils/misc.py delete mode 100644 ruby/utils/tree_manipulation.py diff --git a/README.md b/README.md index e5ec133..1c6961a 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,13 @@ *Why not translate it yourself when Google Translate cannot satisfy you❓* -[![CircleCI](https://circleci.com/gh/urbans/urbans/tree/master.svg?style=svg)](https://circleci.com/gh/urbans/urbans/tree/master) -[![Codacy Badge](https://app.codacy.com/project/badge/Grade/b4937f1f9fe0477b9fc557cbedf92b24)](https://www.codacy.com/gh/urbans/urbans?utm_source=github.com&utm_medium=referral&utm_content=urbans/urbans&utm_campaign=Badge_Grade) -[![Codacy Badge](https://app.codacy.com/project/badge/Coverage/b4937f1f9fe0477b9fc557cbedf92b24)](https://www.codacy.com/gh/urbans/urbans?utm_source=github.com&utm_medium=referral&utm_content=urbans/urbans&utm_campaign=Badge_Coverage) +[![CircleCI](https://circleci.com/gh/pyurbans/urbans/tree/master.svg?style=svg)](https://circleci.com/gh/pyurbans/urbans/tree/master) +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/b4937f1f9fe0477b9fc557cbedf92b24)](https://www.codacy.com/gh/pyurbans/urbans?utm_source=github.com&utm_medium=referral&utm_content=pyurbans/urbans&utm_campaign=Badge_Grade) +[![Codacy Badge](https://app.codacy.com/project/badge/Coverage/b4937f1f9fe0477b9fc557cbedf92b24)](https://www.codacy.com/gh/pyurbans/urbans?utm_source=github.com&utm_medium=referral&utm_content=pyurbans/urbans&utm_campaign=Badge_Coverage) [![PyPI version](https://badge.fury.io/py/urbans.svg)](https://badge.fury.io/py/urbans) -[![GitHub release](https://img.shields.io/github/release/urbans/urbans.svg)](https://GitHub.com/urbans/urbans/releases/) -[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/urbans/urbans/graphs/commit-activity) -[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/urbans/urbans/blob/master/LICENSE) +[![GitHub release](https://img.shields.io/github/release/pyurbans/urbans.svg)](https://GitHub.com/pyurbans/urbans/releases/) +[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/pyurbans/urbans/graphs/commit-activity) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/pyurbans/urbans/blob/master/LICENSE) @@ -68,7 +68,7 @@ trans_sentences = translator.translate(src_sentences) ``` ## ⚖️ License -This repository is using the Apache 2.0 license that is listed in the repo. Please take a look at [`LICENSE`](https://github.com/urbans/urbans/blob/master/LICENSE) as you wish. +This repository is using the Apache 2.0 license that is listed in the repo. Please take a look at [`LICENSE`](https://github.com/pyurbans/urbans/blob/master/LICENSE) as you wish. ## ✍️ BibTeX If you wish to cite the framework feel free to use this (but only if you loved it 😊): @@ -76,10 +76,10 @@ If you wish to cite the framework feel free to use this (but only if you loved i @misc{phat2020urbans, author = {Truong-Phat Nguyen}, title = {URBANS: Universal Rule-Based Machine Translation NLP toolkit}, - year = {2020}, + year = {2021}, publisher = {GitHub}, journal = {GitHub repository}, - howpublished = {\url{https://github.com/urbans/urbans}}, + howpublished = {\url{https://github.com/pyurbans/urbans}}, } ``` diff --git a/ruby/__init__.py b/ruby/__init__.py deleted file mode 100644 index 28b1bb9..0000000 --- a/ruby/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Import production-ready tools of RUBY.""" -from .translator import Translator - -__all__ = ['Translator',] diff --git a/ruby/translator.py b/ruby/translator.py deleted file mode 100644 index a7b4447..0000000 --- a/ruby/translator.py +++ /dev/null @@ -1,83 +0,0 @@ -from typing import Dict, List -from .utils.tree_manipulation import translate_trees_grammar -from .utils.misc import remove_trailing_space -import nltk -from nltk.parse.chart import BottomUpLeftCornerChartParser as Parser - -class Translator: - """""" - def __init__(self, - src_grammar: str, - src_to_tgt_grammar: Dict, - src_to_tgt_dictionary: Dict): - """ - Initialize the translator - Args: - src_grammar (str): source language grammar written in nltk style - E.g: src_grammar = \""" - S -> NP VP - NP -> PRP - VP -> VB PP - PP -> PB NP - NP -> CD NP1 - NP1 -> JJ NN - PRP -> 'I' - VB -> 'go' - PB -> 'to' - CD -> 'a' - JJ -> 'good' - NN -> 'school' - \""" - src_to_tgt_grammar (Dict): Transition between source grammar and target grammar as a dict - E.g: src2target_grammar = { - "NP1 -> JJ NN": "NP1 -> NN JJ" - } - src_to_tgt_dictionary (Dict): Dictionary of word-by-word transition from src language to target language - E.g: en_to_vi_dict = { - "I":"tôi", - "go":"đi", - "to":"tới", - "school":"ngôi_trường", - ... - } - """ - self.src_grammar = nltk.CFG.fromstring(self.__process_text_input(src_grammar)) - self.parser = Parser(self.src_grammar) - self.src_to_tgt_grammar = src_to_tgt_grammar - self.src_to_tgt_dictionary = src_to_tgt_dictionary - - @staticmethod - def __process_text_input(txt): - return remove_trailing_space(txt) - - def translate(self, sentences: List[str] or str, allow_multiple_translation = False): - """ - Translate a list of sentences - Args: - sentences (List[str]): A list of str-typed sentences - Returns: - List[str]: A list of translated sentences - """ - if isinstance(sentences,str): - sentences = [sentences] - - translated_sentences = [] - failed_sentences = [] - - for sentence in sentences: - sentence = self.__process_text_input(sentence) - trees = self.parser.parse(sentence.split()) - list_trees = [tree for tree in trees] - if len(list_trees) == 0: - failed_sentences.append(sentence) - continue - trans_sentence = translate_trees_grammar(list_trees, self.src_to_tgt_grammar, self.src_to_tgt_dictionary) - translated_sentences.append(trans_sentence) - - # String to display failed sentence - failed_sentences = '\n'.join(failed_sentences) - - if len(failed_sentences) > 0: - raise ValueError(f"Please check your grammar again, failed to parse these sentences: \n{failed_sentences}") - - return translated_sentences \ No newline at end of file diff --git a/ruby/utils/__init__.py b/ruby/utils/__init__.py deleted file mode 100644 index 3e43ba2..0000000 --- a/ruby/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Utilities for tree manipulation.""" \ No newline at end of file diff --git a/ruby/utils/misc.py b/ruby/utils/misc.py deleted file mode 100644 index dd8384c..0000000 --- a/ruby/utils/misc.py +++ /dev/null @@ -1,4 +0,0 @@ -import re - -def remove_trailing_space(sentence): - return re.sub(' +', ' ', sentence) \ No newline at end of file diff --git a/ruby/utils/tree_manipulation.py b/ruby/utils/tree_manipulation.py deleted file mode 100644 index 5616a70..0000000 --- a/ruby/utils/tree_manipulation.py +++ /dev/null @@ -1,141 +0,0 @@ -import nltk -from nltk import ParentedTree as PTree -from typing import List -import random - -def tree_to_ptree(tree: nltk.Tree): - tree_str = tree.__str__() - ptree = PTree.fromstring(tree_str) - return ptree - -def swap_tree_given_left(left_tree: nltk.Tree, displacement: List[int], new_words= List[str]): - """Swap left node with right node within a parent node.""" - nodes = [left_tree] - right_tree = left_tree.right_sibling() - parent_tree = left_tree.parent() - # Get all tree pointer - for disp in displacement: - # disp = -1 indicates that is a new word, skip - if disp == -1: - continue - nodes.append(right_tree) - - right_tree = right_tree.right_sibling() - if right_tree == None: - break - - # Remove all siblings and left-most self - for node in nodes: - parent_tree.remove(node) - - # Append with new displacement - for disp in displacement: - # disp = -1 indicates that is a new word - if disp == -1: - new_word = PTree('NEW', [new_words.pop(0)]) - parent_tree.append(new_word) - else: - parent_tree.append(nodes[disp]) - - return parent_tree - - -def build_grammar_str_from_left_most(tree: nltk.Tree): - - left_pt = tree.left_sibling() - right_pt = tree.right_sibling() - parent_pt = tree.parent() - - grammar_str = None - - if parent_pt != None: - grammar_str = f"{parent_pt.label()} -> {tree.label()}" - - # Build grammar from leftmost node in the subtree - if left_pt == None and right_pt != None : - while right_pt != None: - grammar_str += f" {right_pt.label()}" - right_pt = right_pt.right_sibling() - return grammar_str - - -def translate_tree_grammar(tree: nltk.Tree, grammar_substitutions: dict): - """Translate tree grammar based on grammar substitution dict.""" - # Number of substitution done - num_subs = 0 - # Convert tree to ParentedTree - ptree = tree_to_ptree(tree) - old_num_subs = -1 - - # Loops until there no substitution left - while num_subs != old_num_subs: - old_num_subs = num_subs - # Traverse through subtrees - for sub in ptree.subtrees(): - # Create grammar string from left-most node. E.g: NP -> JJ NP, - # in this case, JJ is left-most node - grammar_str = build_grammar_str_from_left_most(sub) - for src_grammar, tgt_grammar in grammar_substitutions.items(): - if grammar_str == src_grammar: - # Increment number of substitutions - num_subs += 1 - # Calculate displacement between 2 grammar strings - disp, new_words = calculate_displacement(src_grammar,tgt_grammar) - # Change tree nodes positions thanks to new displacement - swap_tree_given_left(sub, disp, new_words) - - - translated_grammar_sentence = " ".join(ptree.leaves()) - return translated_grammar_sentence, num_subs - -def translate_sentence_words(sentence, src_to_tgt_dictionary): - words_list = [] - - for word in sentence.split(): - target_word = src_to_tgt_dictionary.get(word,word) - - if isinstance(target_word, list): - target_word = random.choice(target_word) - - words_list.append(target_word) - - return ' '.join(words_list) - -def translate_trees_grammar(list_trees: List[nltk.Tree], src_to_tgt_grammar, src_to_tgt_dictionary): - - # translated sentence map with number of grammar substitution found - trans_map = {} - - for tree in list_trees: - # Translate grammar - trans_gram_sentence, num_subs = translate_tree_grammar(tree, src_to_tgt_grammar) - - # Translate words - trans_lang_sentence = translate_sentence_words(trans_gram_sentence, src_to_tgt_dictionary) - - # Append to trans map - trans_map[trans_lang_sentence] = num_subs - # Return translation that has the most displacement - return max(trans_map, key=trans_map.get) - -def calculate_displacement(src_grammar, tgt_grammar): - """Calculate displacement between 2 grammar. E.g: S -> A B C to S -> B C A has displacement of [1 2 0]""" - src_grammar_lst = src_grammar.split() - tgt_grammar_lst = tgt_grammar.split() - - src_grammar_lst = src_grammar_lst[src_grammar_lst.index("->")+1:] - tgt_grammar_lst = tgt_grammar_lst[tgt_grammar_lst.index("->")+1:] - - displacement = [] - new_words = [] - - for word in tgt_grammar_lst: - try: - displacement.append(src_grammar_lst.index(word)) - except ValueError: - # Resolve ValueError: substring not found - # Which indicates this is a new word - displacement.append(-1) - new_words.append(word) - - return displacement, new_words \ No newline at end of file