thaiconet.py

# -*- coding: utf-8 -*-
"""ThaiCoNet-Pipeline.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sMkR-QSTYg6iBZ52tS8KkALs3IDTuHmA

# 🌐 ThaiCoNet : Thai Co-occurrence Network Analysis (Pipeline)

### 💡 About this notebook

🚧 pipeline_version : ```beta 0.0.6```

**[Access Project Repository (Github)](https://github.com/ChotanansubSoph/ThNTA)**

##### 🧑🏻‍💻 **Notebook Contributor**


*   Chotanansub Sophaken, [Github](https://github.com/ChotanansubSoph)
*   Kantapong Vongpanich, [Github](https://github.com/OnlyJust3rd)

  > 🏢 Department of Computer Engineering, King Mongkut’s University of Technology Thonburi (KMUTT)

  > 🌱 Junior Science Talent Project by Siam Commercial Bank Scholarship (JSTP-SCB Scholarship)

---

##### **📚 Approach Based**

* *A. Takhom, D. Leenoi, C. Sophaken, P. Boonkwan, and T. Supnithi, “An Approach of Network Analysis Enhancing Knowledge Extraction in Thai Newspapers Contexts,” J. Intell. Informatics Smart Technol., vol. 6, no. October 2021, pp. 19–24, 2021 [Access](https://jiist.aiat.or.th/assets/uploads/1635853027829tBupD1635602106085fdegH39.pdf)*

* Sophaken, C., Vongpanich, K., Takhom, A., Boonkwan, P., & Supnithi, T. (2023). Unsupervised Detection of Domain Switching in Thai Multidisciplinary Online News. IIAI Letters on Informatics and Interdisciplinary Research, 3. [Access](https://iaiai.org/letters/index.php/liir/article/view/77/50)

---

#### **🎓 Acknowledgement**
* Akkharawoot Takhom, PhD.
  
  > 🏢 Department of Electrical and Computer Engineering,
Thammasat University
---

### ⚙️ Tools & Resources Preparation

Library & Module Installation

* 🏃 Run the code once.
* ⚠️ If you encounter any errors during the initial execution it may be due to factors such as dependencies or system configurations. To address any encountered errors, simply restart the runtime or kernel. Afterward, run the code cell again to ensure a successful execution.
"""

import subprocess
import sys
import pip

from google.colab import runtime
from tqdm.notebook import tqdm_notebook as tqdm


# Install commands keyed by a short module label. Each value is an argv list
# that install_colab_packages() hands to subprocess.check_call(); entries are
# processed in the insertion order shown here.
REQUIRED_MODULES = {
    'upgrade-pip' : ['pip', 'install', '--upgrade', 'pip'],
    'wheel' : ['pip', 'install', '--upgrade', 'setuptools', 'wheel'],
    'tltk' : ['pip', 'install', 'tltk==1.6.8', '-q'],
    # SECURITY NOTE(review): this index URL embeds credentials in source control
    # (the "glpat-" value looks like a GitLab access token). Consider rotating it
    # and supplying it from an environment variable or secret store instead.
    'longan' : ['pip', 'install', 'longan','--extra-index-url', 'https://installer:glpat-dDG6MBuvUjUKWymz5uBu@gitlab.com/api/v4/projects/35051317/packages/pypi/simple'],
    'deepcut' : ['pip', 'install', 'deepcut==0.7.0.0', '-q'],
    'pythainlp' : ['pip', 'install', 'pythainlp==4.0.2', '-q'],
    'pyvis' : ['pip', 'install', 'pyvis==0.1.9', '-q'],
    # System packages installed via apt-get; presumably prerequisites for the
    # pygraphviz install that follows — TODO confirm.
    'graphviz' : ['apt-get', 'install', '-y', 'graphviz', 'libgraphviz-dev', 'pkg-config', '-q'],
    'pygraphviz' : ['pip', 'install', 'pygraphviz==1.7', '-q'],
}

def str_color(text, color):
    """Return *text* wrapped in the ANSI escape code for *color* plus a reset code.

    Raises:
        KeyError: If *color* is not one of the names in the table below.
    """
    ansi_codes = dict(
        black='\033[30m',
        red='\033[31m',
        green='\033[32m',
        yellow='\033[33m',
        blue='\033[34m',
        purple='\033[35m',
        cyan='\033[36m',
        white='\033[37m',
        reset='\033[0m',
    )
    opening = ansi_codes[color]
    closing = ansi_codes['reset']
    return "{}{}{}".format(opening, text, closing)

def install_colab_packages(lib_dicts, excepts=None):
    """Run the install command of every entry in *lib_dicts* not listed in *excepts*.

    Args:
        lib_dicts: Mapping of a module label to the argv list passed to
            ``subprocess.check_call``.
        excepts: Labels to skip. ``None`` (the default) skips nothing.

    A command that fails (non-zero exit status, or the executable cannot be
    started) is reported on stdout and followed by ``runtime.unassign()``;
    the loop then continues with the next entry.
    """
    skipped = () if excepts is None else excepts
    pending = [lib for lib in lib_dicts if lib not in skipped]
    for lib in tqdm(pending):
        try:
            # Use the mapping that was passed in; the previous code always read
            # the module-level REQUIRED_MODULES and ignored the argument.
            subprocess.check_call(lib_dicts[lib])
        except (subprocess.CalledProcessError, OSError):
            print(f"\n{'-'*70}\n{str_color('[ThaiCoNet]', 'yellow')} {lib} : ⚠️ {str_color('failed to download!', 'red')}\n🔄 please reconnection and try again\n{'-'*70}")
            print('\n')
            runtime.unassign()
        else:
            print(f'{str_color("[ThaiCoNet]","yellow")} {lib} : {str_color("Downloaded","green")} ☑️ ')

def setup(notebook = "colab", excepts=None):
    """Install (on Colab) and import the pipeline's dependencies as module globals.

    Args:
        notebook: When equal to ``"colab"`` the packages in ``REQUIRED_MODULES``
            are installed first via ``install_colab_packages``.
        excepts: Labels of optional packages ('tltk', 'pythainlp', 'longan',
            'deepcut') to skip for both installation and import. ``None``
            (the default) skips nothing.

    Every imported name is declared ``global`` so the rest of this module can
    use it after ``setup()`` has run.
    """
    excepts = [] if excepts is None else excepts

    if notebook == "colab":
        install_colab_packages(REQUIRED_MODULES,excepts)

    # Data manipulation
    global pd,np
    import pandas as pd
    import numpy as np

    # NLP
    global nltk,tltk,longan,deepcut
    global FreqDist,bigrams,term_tokenize
    global pythainlp_word_tokenize, pythainlp_pos_tag, pythainlp_stopwords
    global Dictionary, TfidfModel
    import nltk
    from nltk import FreqDist, bigrams
    from nltk.tokenize import word_tokenize as term_tokenize
    nltk.download('punkt')

    # Optional tokenizers / taggers: imported only when not excluded.
    if 'tltk' not in excepts:
      import tltk
    if 'pythainlp' not in excepts:
      from pythainlp import word_tokenize as pythainlp_word_tokenize
      from pythainlp import pos_tag as pythainlp_pos_tag
      from pythainlp.corpus.common import thai_stopwords as pythainlp_stopwords
    if 'longan' not in excepts:
      import longan
    if 'deepcut' not in excepts:
      import deepcut

    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel

    # Graph visualization
    global nx, plt, Network, display, HTML
    import networkx as nx
    import matplotlib.pyplot as plt
    from pyvis.network import Network
    from IPython.display import display, HTML

    # Add-on
    # NOTE: the name must be spelled `islice` here; the previous `islic` typo
    # left `islice` local to this function, so later module-level use failed.
    global itemgetter, Counter, defaultdict, islice, re, os, requests
    from operator import itemgetter
    from collections import Counter,defaultdict
    from itertools import islice
    import re
    import os
    import requests

    print(f'\n{"-"*70}\n{str_color("[ThaiCoNet]", "yellow")} All required packages {str_color("have completely installed", "green")} ✅ \n{"-"*70}')


# True only when this file runs as the main module while the 'ipykernel' module
# is loaded; the notebook-only sample/demo helpers below are gated on this flag.
__is_ipython_kernel__ = __name__ == "__main__" and 'ipykernel' in sys.modules

def __sample_setup__():
    """Notebook-only step: run setup() with its defaults, then import requests."""
    if not __is_ipython_kernel__:
        return
    setup()
    # This import only binds a function-local name.
    import requests


__sample_setup__()

"""Sample Resources preparation"""

#sample data
def download_data(url, file_name=None):
    """Download *url* and write the response body to *file_name*.

    Args:
        url: Address fetched with ``requests.get``.
        file_name: Destination path. Defaults to the last ``/``-separated
            segment of *url*.

    Raises:
        requests.HTTPError: If the server answers with an error status; the
            destination file is not written in that case.
    """
    if file_name is None:
        file_name = url.split('/')[-1]
    response = requests.get(url)
    # Fail loudly instead of silently saving an HTTP error page as the data file.
    response.raise_for_status()
    with open(file_name, "wb") as file:
        file.write(response.content)

def notebook_download_sample_data():
    """Notebook-only step: fetch the sample news CSV through download_data()."""
    if not __is_ipython_kernel__:
        return
    sample_url = "https://github.com/ChotanansubSoph/ThNTA/raw/main/resources/sample_data/thai_electronic_news_2022.csv"
    download_data(url=sample_url)


notebook_download_sample_data()

"""### 🤖 Methods"""

########## String operation ##########
def isEnglish(s):
    """Return True when every character of *s* has a code point below 128.

    An empty input yields True.
    """
    for symbol in s:
        if ord(symbol) >= 128:
            return False
    return True


########## List Manipulation ##########
def flatten_nested_list(nested_list):
    """Concatenate the items of every inner iterable into one flat list (one level deep)."""
    flat = []
    for inner in nested_list:
        flat.extend(inner)
    return flat


######### DataFrame Manipulation #######
def convert_dataframe_to_paired_tuples(df):
    """Return the first two columns of *df* as a list of ``(col0, col1)`` tuples, row by row."""
    first_column = df.iloc[:, 0].tolist()
    second_column = df.iloc[:, 1].tolist()
    return [(left, right) for left, right in zip(first_column, second_column)]

########## Stopwords ##########
def read_stopwords(file_path: str) -> list:
    """Read a UTF-8 text file and return its lines with surrounding whitespace stripped.

    Blank lines are kept as empty strings.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

########## Tokenization ##########
def pythainlp_tokenize_pos(text): #Secondary Tokenizer
    """Tokenize *text* with pythainlp (whitespace tokens dropped) and POS-tag the tokens.

    Returns whatever ``pythainlp_pos_tag`` produces for the token list.
    """
    tokens = pythainlp_word_tokenize(text, keep_whitespace=False)
    return pythainlp_pos_tag(tokens)


def TNC_extract_tltk_pos_pairs(result): #Inactivated
    """Extract ``(word, POS)`` pairs from ``<w tran=".." POS="..">word</w>`` markup.

    The ``tran`` attribute is matched but discarded.
    """
    tag_pattern = r'<w tran="(.*?)" POS="(.*?)">(.*?)</w>'
    return [(word, pos) for _tran, pos, word in re.findall(tag_pattern, result)]

# def TNC_tokenize_pos_ner_(text): #Inactivated
#   result = []
#   for partial_text in text.split(" "):
#     partial_text = partial_text.replace(")"," ").replace("("," ")
#     result += tltk.nlp.TNC_tag(partial_text,POS="Y")
#   return tltk.nlp.ner(TNC_extract_tltk_pos_pairs(result))


########## Term Frequency ##########
def count_word_frequency(pairs_data):
    """Build a frequency distribution over the tokens found in *pairs_data*.

    All strings of all inner sequences are joined with single spaces,
    re-tokenized with ``term_tokenize`` and counted with ``FreqDist``.
    """
    joined_text = " ".join(word for sublist in pairs_data for word in sublist)
    return FreqDist(term_tokenize(joined_text))


def count_word_pos_frequency(data):
    """Count ``(word, pos)`` pairs across all documents.

    Returns a list of ``(word, pos, freq)`` tuples ordered by descending
    frequency; ties keep first-seen order.
    """
    tally = Counter()
    for doc in data:
        for pair in doc:
            tally[pair] += 1
    # most_common() performs the same stable descending sort on the counts.
    return [(word, pos, freq) for (word, pos), freq in tally.most_common()]

"""### 📝 Text preprocess

Tokenization
"""

def text_tokenize(text: str, tokenizer="pythainlp") -> list:
    """Split *text* into tokens with the backend selected by *tokenizer*.

    Supported names: "tltk"/"tltk-colloc", "tltk-mm"/"tltk-ngram", "tltk-w2v",
    "pythainlp"/"newmm", "deepcut" and "longan". Any other name yields an
    empty list.
    """
    if tokenizer in ("tltk", "tltk-colloc"):
        return tltk.nlp.word_segment(text).split("|")
    if tokenizer in ("tltk-mm", "tltk-ngram"):
        return tltk.nlp.word_segment(text, method="mm").split("|")
    if tokenizer == "tltk-w2v":
        return tltk.nlp.word_segment(text, method="w2v").split("|")
    if tokenizer in ("pythainlp", "newmm"):
        return pythainlp_word_tokenize(text)
    if tokenizer == "deepcut":
        return deepcut.tokenize(text)
    if tokenizer == "longan":
        return longan.tokenize(text)
    # Unknown tokenizer name: nothing is tokenized.
    return []

def __test_tokenize__():
    """Notebook-only smoke test: print text_tokenize() output for each tokenizer."""
    if not __is_ipython_kernel__:
        return
    sample_text = "ประกาศให้มีการสวมหน้ากากอนามัยตลอดเวลา"
    print("Tokenizer Test\n","-"*60)
    labelled_tokenizers = (
        ("tltk ", "tltk"),
        ("tltk-mm ", "tltk-mm"),
        ("tltk-w2v ", "tltk-w2v"),
        ("pythainlp (newmm) ", "pythainlp"),
        ("deepcut ", "deepcut"),
        ("longan ", "longan"),
    )
    for label, tokenizer_name in labelled_tokenizers:
        print(label, text_tokenize(sample_text, tokenizer_name))


__test_tokenize__()

"""POS Tagging"""

def pos_tagging(term_list, pos_tagger):
    """Tag the tokens in *term_list* with the backend selected by *pos_tagger*.

    Supported names: "tltk"/"tltk-pos-tagger", "pythainlp"/"pythainlp-pos-tagger",
    "pythainlp-pud" and "longan". Any other name yields an empty list.
    """
    if pos_tagger in ("tltk", "tltk-pos-tagger"):
        return tltk.pos_tag_wordlist(term_list)
    if pos_tagger in ("pythainlp", "pythainlp-pos-tagger"):
        return pythainlp_pos_tag(term_list)
    if pos_tagger == "pythainlp-pud":
        return pythainlp_pos_tag(term_list, corpus="pud")
    if pos_tagger == "longan":
        # longan.pos() output is paired positionally with the input tokens.
        tags = longan.pos(term_list)
        return list(zip(term_list, tags))
    # Unknown tagger name: nothing is tagged.
    return []

def __sample_pos_tagging__():
    """Notebook-only smoke test: print pos_tagging() output for a fixed token list."""
    if not __is_ipython_kernel__:
        return
    print("POS Tagging Test\n","-"*60)
    sample_terms = ['ประกาศ', 'ให้', 'มี', 'การ', 'สวม', 'หน้ากาก', 'อนามัย', 'ตลอด', 'เวลา', '<s/>']
    print(pos_tagging(sample_terms, "tltk"))

__sample_pos_tagging__()

"""Token Filtering"""

def token_filter(pos_pairs: list, stopwords: set, keep_pos=None):
    """Drop unwanted ``(term, pos)`` pairs.

    A pair is kept only when all of the following hold:
      * *keep_pos* is empty/None, or the pair's POS is in *keep_pos*;
      * the term is not in *stopwords*;
      * the term is longer than one character;
      * the term is not pure ASCII (see ``isEnglish``);
      * the term contains none of the symbols ``@_!#$%^&*()<>?/|}{~:.``;
      * the term contains no non-breaking space (``\\xa0``).

    Args:
        pos_pairs: Iterable of ``(term, pos)`` pairs.
        stopwords: Collection of terms to remove; ``None`` removes none.
        keep_pos: POS tags to retain; empty or ``None`` retains every tag.

    Returns:
        list of the surviving ``(term, pos)`` tuples, in input order.
    """
    # The caller's stopwords are honoured here. The previous code reassigned
    # `stopwords = []`, which silently disabled stopword removal.
    if stopwords is None:
        blocked_terms = frozenset()
    elif isinstance(stopwords, (set, frozenset)):
        blocked_terms = stopwords
    else:
        blocked_terms = frozenset(stopwords)

    # Raw string: `\|` in a normal string literal is an invalid escape sequence.
    symbol_pattern = re.compile(r'[@_!#$%^&*()<>?/\|}{~:.]')

    return [
        (term, pos) for term, pos in pos_pairs
        if (not keep_pos or pos in keep_pos)
        and term not in blocked_terms
        and len(term) > 1
        and not isEnglish(term)
        and symbol_pattern.search(term) is None
        and "\xa0" not in term
    ]

"""Wrap-up: token preparation process"""

def feed_preprocess(docs: list, stopwords = None, tokenizer="deepcut",pos_tagger = "tltk",keep_pos=('NOUN','VERB'), is_filter = True, is_pos = True, is_tokenize=True) -> list:
    """Tokenize, POS-tag and filter every document in *docs*.

    Args:
        docs: Iterable of documents (raw text, or pre-tokenized input when
            *is_tokenize* is False).
        stopwords: Terms handed to ``token_filter``; defaults to
            ``pythainlp_stopwords()``.
        tokenizer: Backend name for ``text_tokenize``.
        pos_tagger: Backend name for ``pos_tagging``.
        keep_pos: POS tags handed to ``token_filter``.
        is_filter: Apply ``token_filter`` to the tagged pairs.
        is_pos: Apply ``pos_tagging``.
        is_tokenize: Apply ``text_tokenize``.

    Returns:
        list with one entry per document, holding that document's processed
        terms.
    """
    preprocessed_docs = []

    if stopwords is None:
        stopwords = pythainlp_stopwords()

    # pythainlp can tokenize and tag in one go; that shortcut only applies
    # when both steps are actually requested.
    use_pythainlp_shortcut = (
        tokenizer == "pythainlp"
        and pos_tagger == "pythainlp"
        and is_tokenize
        and is_pos
    )

    for text in tqdm(docs):
        if use_pythainlp_shortcut:
            preprocessed_terms = pythainlp_tokenize_pos(text)
        else:
            preprocessed_terms = text

            # Tokenization
            if is_tokenize:
                preprocessed_terms = text_tokenize(
                    text=preprocessed_terms,
                    tokenizer=tokenizer,
                )

            # POS tagging
            if is_pos:
                preprocessed_terms = pos_tagging(preprocessed_terms, pos_tagger)

        # Filtering and collection now run for BOTH branches. Previously they
        # lived inside the `else`, so the pythainlp/pythainlp path returned an
        # empty result list.
        if is_filter:
            preprocessed_terms = token_filter(
                pos_pairs=preprocessed_terms,
                stopwords=stopwords,
                keep_pos=keep_pos,
            )

        preprocessed_docs.append(preprocessed_terms)

    return preprocessed_docs

"""Tokenization & Token Filtering Demonstration"""

def __sample_load_data__():
    """Notebook-only step: read the sample CSV into the global ``sample_data`` and display it."""
    global sample_data
    if not __is_ipython_kernel__:
        return
    sample_data = pd.read_csv("thai_electronic_news_2022.csv")
    display(sample_data)


__sample_load_data__()

def __sample_feed_process__():
    """Notebook-only step: preprocess the sample ``content`` column into ``sample_tokenized_data``."""
    global sample_tokenized_data
    if not __is_ipython_kernel__:
        return
    # approximate time : pythainlp ~2.17 min / tltk ~ min /longan
    sample_tokenized_data = feed_preprocess(
        docs=sample_data["content"],
        tokenizer="pythainlp",
        pos_tagger="tltk-pos-tagger",
        keep_pos=[],
    )

__sample_feed_process__()

def __sample_show_tokenized_data__():
    """Notebook-only step: display the first ten entries of the first two processed documents."""
    if not __is_ipython_kernel__:
        return
    for doc_index in (0, 1):
        display(sample_tokenized_data[doc_index][:10])


__sample_show_tokenized_data__()

def flat_pos_list(pos_list):
    """Turn every ``(term, pos)`` pair into the string ``"term|pos"``, document by document."""
    return [
        [pair[0] + "|" + pair[1] for pair in doc]
        for doc in pos_list
    ]

def __sample_show_flated_pos_list__():
    """Notebook-only step: flatten the first two processed documents and display 15 entries."""
    global flated_pos_list
    if not __is_ipython_kernel__:
        return
    flated_pos_list = flat_pos_list(sample_tokenized_data[:2])
    display(flated_pos_list[0][:15])


__sample_show_flated_pos_list__()

def calculate_tfidf(documents):
    """Compute TF-IDF weights for tokenized *documents*.

    A ``Dictionary`` is built from the documents, each document is converted
    to bag-of-words with ``doc2bow``, and a ``TfidfModel`` fitted on that
    corpus scores it.

    Returns:
        list with one dict per document, mapping each term to its weight.
    """
    vocabulary = Dictionary(documents)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in documents]
    model = TfidfModel(bow_corpus)

    weighted_docs = [model[bow] for bow in bow_corpus]
    # Translate token ids back into the original terms.
    return [
        {vocabulary[token_id]: weight for token_id, weight in weighted}
        for weighted in weighted_docs
    ]

def __sample_show_tfidf_pairs__():
    """Notebook-only step: compute TF-IDF for ``flated_pos_list`` and display 15 items of the first document."""
    global tfidf_pairs
    if not __is_ipython_kernel__:
        return
    tfidf_pairs = calculate_tfidf(flated_pos_list)
    first_doc_preview = islice(tfidf_pairs[0].items(), 15)
    display(dict(list(first_doc_preview)))


__sample_show_tfidf_pairs__()

def __sample_freq_detection__():
    """Notebook-only step: count (word, pos) frequencies of the sample and display the top 15."""
    global sample_tokenized_freq
    if not __is_ipython_kernel__:
        return
    sample_tokenized_freq = count_word_pos_frequency(sample_tokenized_data)
    display(sample_tokenized_freq[:15])


__sample_freq_detection__()

"""Generate bag of co-occurrence terminology"""

def generate_bigram_freq(term_list)->list:
    """Count bigrams across all token lists in *term_list*.

    Args:
        term_list: Iterable of token sequences (one per document).

    Returns:
        list of ``(bigram, count)`` tuples sorted by descending count.
        Documents whose bigrams cannot be produced are skipped.
    """
    bigram_list = []

    for word_list in term_list:
        try:
            doc_bigrams = list(bigrams(word_list))
        except Exception:
            # Deliberate best effort: a malformed document is skipped. Unlike
            # the previous bare `except`, KeyboardInterrupt and SystemExit
            # are no longer swallowed.
            continue
        bigram_list.extend(doc_bigrams)

    frequency_dist = FreqDist(bigram_list)
    return sorted(frequency_dist.items(), key=itemgetter(1), reverse=True)


def generate_trigrams(data):
    """Return, for each document, the list of its consecutive 3-item windows.

    Documents shorter than three items produce an empty list.
    """
    return [
        list(zip(doc, doc[1:], doc[2:]))
        for doc in data
    ]


def count_triple_frequency(docs):
    """Count every triple across *docs*.

    Returns a list of ``(triple, count)`` tuples ordered by descending count;
    ties keep first-seen order.
    """
    tally = Counter()
    for doc in docs:
        for triple in doc:
            tally[triple] += 1
    # most_common() is the same stable descending sort on the counts.
    return tally.most_common()

def __sample_gen_trigrams__():
    """Notebook-only step: build trigrams for the sample and display the first 20 of document 0."""
    global sample_cooc_data
    if not __is_ipython_kernel__:
        return
    sample_cooc_data = generate_trigrams(sample_tokenized_data)
    display(sample_cooc_data[0][:20])


__sample_gen_trigrams__()

def flat_triple_doc(triple_docs):
    """Serialize each triple of ``(token, pos)`` pairs as ``"t1|p1|t2|p2|t3|p3"``, per document."""
    return [
        ['|'.join(f'{token}|{pos}' for token, pos in triple) for triple in doc]
        for doc in triple_docs
    ]

def __sample__show_flat_triple_doc__():
    """Notebook-only step: display 15 serialized trigrams of the first sample document."""
    if not __is_ipython_kernel__:
        return
    first_doc_flat = flat_triple_doc(sample_cooc_data[:1])[0]
    display(first_doc_flat[:15])


__sample__show_flat_triple_doc__()

def __sample__show_triple_tfidf_list__():
    """Notebook-only step: compute TF-IDF over the serialized trigrams and display document 0."""
    global triple_tfidf_list
    if not __is_ipython_kernel__:
        return
    serialized_triples = flat_triple_doc(sample_cooc_data)
    triple_tfidf_list = calculate_tfidf(serialized_triples)
    display(triple_tfidf_list[0])


__sample__show_triple_tfidf_list__()

def __sample_cooc_freqs_detection__():
    """Notebook-only step: count trigram frequencies of the sample and display the top 10."""
    global sample_cooc_freqs
    if not __is_ipython_kernel__:
        return
    sample_cooc_freqs = count_triple_frequency(sample_cooc_data)
    display(sample_cooc_freqs[:10])


__sample_cooc_freqs_detection__()

def filter_pos_triples(data, keeps=('NOUN', 'VERB', 'NOUN')):
    """Keep the ``(triple, freq)`` entries whose three POS tags equal *keeps* position by position.

    Each triple is expected to hold three ``(token, pos)`` pairs.
    """
    matching = []
    for entry in data:
        triple = entry[0]
        # Lazy, left-to-right comparison; stops at the first mismatch.
        if all(triple[slot][1] == keeps[slot] for slot in range(3)):
            matching.append(entry)
    return matching

def __sample_filter_pos__():
    """Notebook-only step: keep trigrams matching the default POS pattern and display the first 20."""
    global sample_filtered_cooc
    if not __is_ipython_kernel__:
        return
    sample_filtered_cooc = filter_pos_triples(sample_cooc_freqs)
    display(sample_filtered_cooc[:20])


__sample_filter_pos__()

def bgs_filter_extreme(bgs_list, min_percent=0.05, max_percent=0.8):
    """Drop bigrams whose count is extreme relative to the most frequent one.

    Args:
        bgs_list: Iterable of ``(pair, count)`` tuples.
        min_percent: Lower bound, as a fraction of the highest count.
        max_percent: Upper bound, as a fraction of the highest count.

    Returns:
        list of ``(pair, count)`` sorted by descending count, keeping entries
        with ``min <= count <= max`` whose two members differ. Empty input
        gives an empty list (it used to raise ``IndexError``).
    """
    ranked = sorted(bgs_list, key=itemgetter(1), reverse=True)
    if not ranked:
        return []

    most_freq = ranked[0][1]
    max_freq = most_freq * max_percent
    min_freq = most_freq * min_percent

    return [
        (pair, count) for pair, count in ranked
        if min_freq <= count <= max_freq and pair[0] != pair[1]
    ]

"""## 🔮 Visualization"""

def visualize_cooccurrence(data, file_name="thaiconet_result.html", highlight_threshold=45):
    """Render ``(triple, freq)`` co-occurrence data as an interactive pyvis network.

    Args:
        data: Iterable of ``(triple, freq)`` where ``triple`` holds three
            ``(token, pos)`` pairs read as subject, predicate and object.
        file_name: HTML file written by ``net.show``.
        highlight_threshold: Edges whose weight is greater than this value
            are drawn orange, the others gray. Defaults to the former
            hard-coded value 45.
    """
    # Phase 1: NetworkX
    G = nx.Graph()

    # Number of triples each term appears in as subject or object.
    node_degrees = {}

    for triples, freq in data:
        sbj = triples[0][0]
        pred = triples[1][0]
        obj = triples[2][0]

        node_degrees[sbj] = node_degrees.get(sbj, 0) + 1
        node_degrees[obj] = node_degrees.get(obj, 0) + 1

        # Node size grows with the running count and is capped at 30.
        G.add_node(sbj, size=min(node_degrees[sbj] * 5, 30), node_type = "sbj")
        G.add_node(obj, size=min(node_degrees[obj] * 5, 30), node_type = "obj")

        # The predicate becomes the edge label; sbj_verifier records which
        # endpoint is the subject so the direction can be restored below.
        G.add_edge(sbj, obj, weight=freq, label=pred, sbj_verifier = sbj)

    # Phase 2: pyvis
    net = Network(height="1000px",
                  width="100%",
                  notebook=True,
                  directed =True,
                 )

    for node in G.nodes():
        net.add_node(node, size=G.nodes[node]['size'])

    # `edge_attrs` avoids shadowing the `data` parameter, as the old loop variable did.
    for u, v, edge_attrs in G.edges(data=True):
        weight = edge_attrs['weight']
        label = edge_attrs['label']
        sbj_verifier = edge_attrs['sbj_verifier']

        color = "orange" if weight > highlight_threshold else "gray"

        # G is undirected, so make sure the edge points from subject to object.
        if u != sbj_verifier:
          u,v = v,u

        net.add_edge(u, v, value=weight, color=color, label=label)

    net.show_buttons(filter_=['physics'])
    net.show(file_name)

def __sample__visualization__():
    """Notebook-only step: drop trigrams containing a custom stop word, then plot up to 200 of the rest."""
    if not __is_ipython_kernel__:
        return

    custom_stop_words = ['การ']
    # An entry survives only if none of its three tokens is a custom stop word.
    selected = [
        entry for entry in sample_filtered_cooc
        if not any(pair[0] in custom_stop_words for pair in entry[0])
    ]

    visualize_cooccurrence(selected[:200])
    display(HTML("thaiconet_result.html"))


__sample__visualization__()