From 4db4269edab8ac18fa21f8b51ce4b51dbe99011b Mon Sep 17 00:00:00 2001
From: Ankan
Date: Sun, 3 Nov 2024 15:33:40 +0000
Subject: [PATCH] Added the Grammar AutoCorrector

---
 NLP/Grammar Auto Corrector/README.md | 42 ++++++++++++++++
 NLP/Grammar Auto Corrector/main.py   | 74 ++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 NLP/Grammar Auto Corrector/README.md
 create mode 100644 NLP/Grammar Auto Corrector/main.py

diff --git a/NLP/Grammar Auto Corrector/README.md b/NLP/Grammar Auto Corrector/README.md
new file mode 100644
index 000000000..84d997119
--- /dev/null
+++ b/NLP/Grammar Auto Corrector/README.md
@@ -0,0 +1,42 @@
+# Grammar AutoCorrector
+
+A Grammar AutoCorrector tool that automatically detects and corrects grammatical errors in English sentences. It combines standard NLP preprocessing with the T5 transformer model for grammar correction, making it suitable for writing-assistance tools, educational platforms, and beyond.
+
+## Features
+- Preprocessing techniques such as tokenization, lemmatization, stop-word removal, and punctuation removal.
+- Fine-tuning of a grammar-correction model on parallel datasets of incorrect and corrected sentences.
+- Identification and correction of common grammatical errors in sentences.
+
+## Modules Used
+
+1. Transformers
+2. Torch
+3. NLTK
+4. SpaCy
+5. Pandas
+6. NumPy
+7. re (Regular Expressions)
+8. Scikit-Learn
+9. pytest
+10. datasets (Hugging Face)
+11. yaml
+12. tqdm
+
+## Data
+Download an appropriate grammar-correction dataset, such as:
+- [Cambridge English Write & Improve + LOCNESS](https://ilexir.co.uk/datasets/index.html)
+- [Grammarly GEC Dataset](https://www.grammarly.com/research/grammatical-error-correction/)
+- [JFLEG](https://github.com/keisks/jfleg)
+
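+## Usage
+
+A minimal inference sketch (note that `main.py` currently trains at import time, so importing it triggers a full training run first); the corrected output shown is illustrative, not guaranteed:
+
+```python
+from main import correct_grammar
+
+print(correct_grammar("She go to the market every morning."))
+# e.g. -> "She goes to the market every morning."
+```
+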
+## Connect with Me
+
+- **GitHub**: [Peart-Guy](https://github.com/Peart-Guy)
+- **LinkedIn**: [Ankan Mukhopadhyay](https://www.linkedin.com/in/ankan-mukhopadhyaypeartguy/)

diff --git a/NLP/Grammar Auto Corrector/main.py b/NLP/Grammar Auto Corrector/main.py
new file mode 100644
index 000000000..2286ad8e9
--- /dev/null
+++ b/NLP/Grammar Auto Corrector/main.py
@@ -0,0 +1,74 @@
+import re
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+from transformers import Trainer, TrainingArguments
+from datasets import load_dataset
+
+# Download NLTK resources
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')
+nltk.download('averaged_perceptron_tagger')
+
+# Initialize the lemmatizer and stop-word list
+lemmatizer = WordNetLemmatizer()
+stop_words = set(stopwords.words('english'))
+
+
+def preprocess_text(text):
+    """Classic preprocessing: lowercasing, punctuation and stop-word removal,
+    lemmatization. Useful for corpus analysis; it is NOT applied before T5,
+    which expects raw text and uses its own subword tokenizer."""
+    text = text.lower()
+    text = re.sub(r'[^\w\s]', '', text)
+    tokens = word_tokenize(text)
+    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
+
+
+# Load pre-trained T5 model and tokenizer
+model = T5ForConditionalGeneration.from_pretrained('t5-small')
+tokenizer = T5Tokenizer.from_pretrained('t5-small')
+
+# NOTE: BookCorpus and Wikipedia are raw monolingual corpora with a single
+# `text` column; they contain no (incorrect, corrected) sentence pairs, so
+# they cannot be used directly for supervised grammar correction. Use a
+# parallel dataset such as JFLEG or W&I+LOCNESS (see README). The
+# `input_texts`/`output_texts` column names below are placeholders for such
+# a dataset.
+dataset = load_dataset("bookcorpus", split="train")  # BooksCorpus
+wiki_dataset = load_dataset("wikipedia", "20220301.en", split="train")  # Wikipedia
+
+
+def tokenize_batch(batch):
+    # Tokenize inputs and outputs into the format the Trainer expects
+    model_inputs = tokenizer(
+        ["correct: " + text for text in batch["input_texts"]],
+        padding="max_length", truncation=True, max_length=128,
+    )
+    # The label is the corrected sentence itself; no task prefix is needed.
+    # (Optionally mask pad tokens in the labels with -100 so they are
+    # ignored by the loss.)
+    labels = tokenizer(
+        batch["output_texts"],
+        padding="max_length", truncation=True, max_length=128,
+    )
+    model_inputs["labels"] = labels["input_ids"]
+    return model_inputs
+
+
+# Define a training function
+def train_model(dataset):
+    tokenized = dataset.map(tokenize_batch, batched=True,
+                            remove_columns=dataset.column_names)
+
+    # Define Trainer
+    training_args = TrainingArguments(
+        output_dir='./results',
+        per_device_train_batch_size=4,
+        num_train_epochs=3,
+        weight_decay=0.01,
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized,
+    )
+
+    trainer.train()
+
+
+# Train the model on the processed (parallel) dataset
+train_model(dataset)
+
+
+def correct_grammar(text):
+    input_text = "correct: " + text
+    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+    # The default max_length for generate() is short; allow full sentences
+    outputs = model.generate(input_ids, max_length=64)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+
+# Example usage
+test_sentence = "She go to the market every morning."
+print("Corrected Sentence:", correct_grammar(test_sentence))
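+
+# Optional extension (not in the original script): correct several sentences
+# in one batch. A minimal sketch; the generation settings (`num_beams`,
+# `max_length`) are illustrative assumptions rather than tuned values.
+def correct_grammar_batch(texts, max_length=64, num_beams=4):
+    enc = tokenizer(["correct: " + t for t in texts],
+                    return_tensors="pt", padding=True, truncation=True)
+    outputs = model.generate(input_ids=enc.input_ids,
+                             attention_mask=enc.attention_mask,
+                             max_length=max_length,
+                             num_beams=num_beams,
+                             early_stopping=True)
+    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+print(correct_grammar_batch(["He go to school yesterday.", "They was happy."]))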