Skip to content

Commit

Permalink
אימון מחודש, 64 + 0.3, לוג משופר
Browse files Browse the repository at this point in the history
  • Loading branch information
NHLOCAL committed Oct 22, 2024
1 parent fe78a40 commit ecf147e
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 84 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/update_Training.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
- machine-learn
paths:
- 'machine-learn/scrape_data/cleaned_new-data.json'
- 'machine-learn/creating_model_git.py'

jobs:
train-ner-model:
Expand All @@ -18,11 +19,11 @@ jobs:
- name: Set up Python environment
uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.11'

- name: Install dependencies
run: |
pip install spacy==3.7.5 pandas
pip install spacy==3.7.5 pandas tqdm
- name: Set working directory
run: cd machine-learn/
Expand Down
182 changes: 101 additions & 81 deletions machine-learn/creating_model_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@
from spacy.util import minibatch, compounding
import json
import random
import logging
from tqdm import tqdm

# Configure module-wide logging: timestamped INFO messages to the console.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

def custom_tokenizer(nlp):
default_tokenizer = Tokenizer(nlp.vocab)
Expand Down Expand Up @@ -40,87 +52,95 @@ def custom_tokenizer(nlp):

return nlp2.tokenizer

nlp = spacy.blank("he")
nlp.tokenizer = custom_tokenizer(nlp)

test_text = "תומר כהן- הישראלי הבכיר בלינקדין"
doc = nlp(test_text)
print([token.text for token in doc])

ner = nlp.add_pipe("ner")
ner.add_label("SINGER")

json_files = [
'/home/runner/work/Singles-Sorter/Singles-Sorter/machine-learn/scrape_data/cleaned_new-data.json'
]

training_data = []
for json_file in json_files:
with open(json_file, 'r', encoding='utf-8') as f:
data = json.load(f)
for example_text, example_entities in data:
entities = example_entities.get('entities', [])
example = Example.from_dict(nlp.make_doc(example_text), {'entities': entities})
training_data.append(example)


nlp.begin_training()

patience = 5
min_delta = 0.01
best_loss = float('inf')
patience_counter = 0
best_model_path = "/home/runner/work/Singles-Sorter/Singles-Sorter/machine-learn/best_model"
n_iter = 100
# batch_sizes = compounding(16.0, 64.0, 1.001)
batch_size = 32
drop_size = 0.5
iteration_data = {}
#initial_lr = 0.001 # שיעור למידה התחלתי
#lr_decay = 0.95 # קצב דעיכת שיעור הלמידה
# optimizer.learn_rate = initial_lr

for itn in range(n_iter):
random.shuffle(training_data)
losses = {}
for i in range(0, len(training_data), batch_size):
batch = training_data[i:i + batch_size]
nlp.update(batch, drop=drop_size, losses=losses)
print(f"Iteration {itn}: {losses}")
iteration_data[itn] = losses.copy()

current_loss = losses.get('ner', float('inf'))
if current_loss < best_loss - min_delta:
best_loss = current_loss
def main():
    """Train a Hebrew spaCy NER model that recognizes singer names.

    Loads annotated examples from JSON, trains with mini-batches and
    early stopping, keeps the best checkpoint on disk, and finally
    re-saves that checkpoint under the name read from model_name.txt.
    Any failure is logged rather than raised, so the CI job exits cleanly.
    """
    try:
        nlp = spacy.blank("he")
        nlp.tokenizer = custom_tokenizer(nlp)

        # Sanity-check the custom tokenizer on a sample song title.
        test_text = "תומר כהן- הישראלי הבכיר בלינקדין"
        doc = nlp(test_text)
        logger.info(f"Tokenized Text: {[token.text for token in doc]}")

        ner = nlp.add_pipe("ner")
        ner.add_label("SINGER")

        json_files = [
            '/home/runner/work/Singles-Sorter/Singles-Sorter/machine-learn/scrape_data/cleaned_new-data.json'
        ]

        # Each JSON file holds (text, {"entities": [...]}) pairs.
        training_data = []
        for json_file in json_files:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                for example_text, example_entities in data:
                    entities = example_entities.get('entities', [])
                    example = Example.from_dict(nlp.make_doc(example_text), {'entities': entities})
                    training_data.append(example)

        logger.info(f"Loaded {len(training_data)} training examples.")

        nlp.begin_training()

        # Early-stopping and training hyperparameters.
        patience = 5          # iterations without improvement before stopping
        min_delta = 0.1       # minimum loss drop that counts as improvement
        best_loss = float('inf')
        patience_counter = 0
        best_model_path = "/home/runner/work/Singles-Sorter/Singles-Sorter/machine-learn/best_model"
        n_iter = 60
        batch_size = 64
        drop_size = 0.3
        iteration_data = {}   # per-iteration losses, dumped to JSON below

        logger.info("Starting training...")

        for itn in tqdm(range(n_iter), desc="Training Iterations"):
            random.shuffle(training_data)
            losses = {}
            for i in range(0, len(training_data), batch_size):
                batch = training_data[i:i + batch_size]
                nlp.update(batch, drop=drop_size, losses=losses)
            logger.info(f"Iteration {itn}: {losses}")
            iteration_data[itn] = losses.copy()

            current_loss = losses.get('ner', float('inf'))
            if current_loss < best_loss - min_delta:
                best_loss = current_loss
                patience_counter = 0
                # Persist the best checkpoint so far.
                nlp.to_disk(best_model_path)
                logger.info(f"New best model found at iteration {itn} with loss {current_loss}. Saved to {best_model_path}.")
            else:
                patience_counter += 1
                logger.info(f"No improvement in iteration {itn}. Patience counter: {patience_counter}/{patience}")

            if patience_counter >= patience:
                logger.info(f"Early stopping at iteration {itn} due to no improvement.")
                break

        with open("/home/runner/work/Singles-Sorter/Singles-Sorter/machine-learn/model_name.txt", 'r', encoding='utf-8') as f:
            model_name = f.read().strip()
        logger.info(f'Final Model Name: {model_name}')

        # Best-effort dump of the loss history; failure here must not
        # abort the final model save.
        try:
            with open('/home/runner/work/Singles-Sorter/Singles-Sorter/machine-learn/iteration_data.json', 'w', encoding='utf-8') as f:
                json.dump(iteration_data, f, ensure_ascii=False, indent=2)
            logger.info("Saved iteration data to JSON file.")
        except Exception as e:
            logger.error(f'Error saving iteration data to JSON file: {e}')

        # Reload the best checkpoint before saving it under the final name,
        # so the published model is the best one, not the last one.
        nlp = spacy.load(best_model_path)
        nlp.meta['name'] = 'singer_ner_he'
        nlp.meta['description'] = 'Model for recognizing singer names in Hebrew song titles'
        nlp.meta['author'] = 'nhlocal'
        nlp.meta['email'] = '[email protected]'
        nlp.meta['license'] = 'MIT'
        nlp.meta['tags'] = ['NER', 'Hebrew', 'Singer', 'Named Entity Recognition', 'Text Classification']
        nlp.to_disk(model_name)
        logger.info(f"Saved final model to {model_name}.")

    except Exception as e:
        logger.error(f"An error occurred during training: {e}")


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion machine-learn/model_name.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
custom_ner_model23git
custom_ner_model27-1git

0 comments on commit ecf147e

Please sign in to comment.