Bump stable version to 0.3.0 (#33)

* reconstruct code * show aff; fix latex replace bug; * remove pdb * checkout to specific ref * update readme * fix invalid tar * Retrieve arxiv paper from Atom feed (#31) * retrieve from rss * fix bug * fix bug * fix bug * clean code * Release v0.3.0 (#32) * bump version to 0.3.0 * update readme * update uv.lock
TideDra · Dec 25, 2024 · 8f41ec0 · 8f41ec0
1 parent e36b35f
commit 8f41ec0
Show file tree

Hide file tree

Showing 10 changed files with 444 additions and 334 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ dist/
 wheels/
 .vscode/
 *.egg-info
+.env
 
 # Virtual environments
 .venv

diff --git a/README.md b/README.md
@@ -33,6 +33,7 @@
 ## ✨ Features
 - Totally free! All the calculation can be done in the Github Action runner locally within its quota (for public repo).
 - AI-generated TL;DR for you to quickly pick up target papers.
+- Affiliations of the paper are resolved and presented.
 - Links of PDF and code implementation (if any) presented in the e-mail.
 - List of papers sorted by relevance with your recent research interest.
 - Fast deployment via fork this repo and set environment variables in the Github Action Page.
@@ -56,7 +57,7 @@ Below are all the secrets you need to set. They are invisible to anyone includin
 | :--- | :---: | :---  | :---  | :--- |
 | ZOTERO_ID | ✅ | str  | User ID of your Zotero account. Get your ID from [here](https://www.zotero.org/settings/security). | 12345678  |
 | ZOTERO_KEY | ✅ | str  | An Zotero API key with read access. Get a key from [here](https://www.zotero.org/settings/security).  | AB5tZ877P2j7Sm2Mragq041H   |
-| ARXIV_QUERY | ✅ | str  | The search query for retrieving arxiv papers. Refer to the [official document](https://info.arxiv.org/help/api/user-manual.html#query_details) for details. The example queries papers about AI, CV, NLP, ML. Find the abbr of your research area from [here](https://arxiv.org/category_taxonomy).  | cat:cs.AI OR cat:cs.CV OR cat:cs.LG OR cat:cs.CL |
+| ARXIV_QUERY | ✅ | str  | The categories of target arxiv papers. Use `+` to concatenate multiple categories. The example retrieves papers about AI, CV, NLP, ML. Find the abbr of your research area from [here](https://arxiv.org/category_taxonomy).  | cs.AI+cs.CV+cs.LG+cs.CL |
 | SMTP_SERVER | ✅ | str | The SMTP server that sends the email. I recommend to utilize a seldom-used email for this. Ask your email provider (Gmail, QQ, Outlook, ...) for its SMTP server| smtp.qq.com |
 | SMTP_PORT | ✅ | int | The port of SMTP server. | 465 |
 | SENDER | ✅ | str | The email account of the SMTP server that sends you email. | abc@qq.com |
@@ -118,6 +119,9 @@ The TLDR of each paper is generated by a lightweight LLM (Qwen2.5-3b-instruct-q4
 - The recommendation algorithm is very simple, it may not accurately reflect your interest. Welcome better ideas for improving the algorithm!
 - This workflow deploys an LLM on the cpu of Github Action runner, and it takes about 70s to generate a TLDR for one paper. High `MAX_PAPER_NUM` can lead the execution time exceed the limitation of Github Action runner (6h per execution for public repo, and 2000 mins per month for private repo). Commonly, the quota given to public repo is definitely enough for individual use. If you have special requirements, you can deploy the workflow in your own server, or use a self-hosted Github Action runner, or pay for the exceeded execution time.
 
+## 👯‍♂️ Contribution
+Issues and PRs are welcome! But remember that **each PR should be merged into the `dev` branch**.
+
 ## 📃 License
 Distributed under the AGPLv3 License. See `LICENSE` for detail.
 

diff --git a/construct_email.py b/construct_email.py
@@ -1,5 +1,12 @@
-import arxiv
+from paper import ArxivPaper
 import math
+from tqdm import tqdm
+from email.header import Header
+from email.mime.text import MIMEText
+from email.utils import parseaddr, formataddr
+import smtplib
+import datetime
+
 framework = """
 <!DOCTYPE HTML>
 <html>
@@ -49,7 +56,7 @@ def get_empty_html():
   """
   return block_template
 
-def get_block_html(title:str, authors:str, rate:str,arxiv_id:str, abstract:str, pdf_url:str, code_url:str=None):
+def get_block_html(title:str, authors:str, rate:str,arxiv_id:str, abstract:str, pdf_url:str, code_url:str=None, affiliations:str=None):
     code = f'<a href="{code_url}" style="display: inline-block; text-decoration: none; font-size: 14px; font-weight: bold; color: #fff; background-color: #5bc0de; padding: 8px 16px; border-radius: 4px; margin-left: 8px;">Code</a>' if code_url else ''
     block_template = """
     <table border="0" cellpadding="0" cellspacing="0" width="100%" style="font-family: Arial, sans-serif; border: 1px solid #ddd; border-radius: 8px; padding: 16px; background-color: #f9f9f9;">
@@ -61,6 +68,8 @@ def get_block_html(title:str, authors:str, rate:str,arxiv_id:str, abstract:str,
     <tr>
         <td style="font-size: 14px; color: #666; padding: 8px 0;">
             {authors}
+            <br>
+            <i>{affiliations}</i>
         </td>
     </tr>
     <tr>
@@ -87,7 +96,7 @@ def get_block_html(title:str, authors:str, rate:str,arxiv_id:str, abstract:str,
     </tr>
 </table>
 """
-    return block_template.format(title=title, authors=authors,rate=rate,arxiv_id=arxiv_id, abstract=abstract, pdf_url=pdf_url, code=code)
+    return block_template.format(title=title, authors=authors,rate=rate,arxiv_id=arxiv_id, abstract=abstract, pdf_url=pdf_url, code=code, affiliations=affiliations)
 
 def get_stars(score:float):
     full_star = '<span class="full-star">⭐</span>'
@@ -106,25 +115,46 @@ def get_stars(score:float):
         return '<div class="star-wrapper">'+full_star * full_star_num + half_star * half_star_num + '</div>'
 
 
-def render_email(papers:list[arxiv.Result]):
+def render_email(papers:list[ArxivPaper]):
+    """Render the HTML body of the e-mail for the given papers.
+
+    The `__CONTENT__` placeholder in `framework` is replaced with one
+    `get_block_html` block per paper, or with the output of
+    `get_empty_html()` when `papers` is empty.
+    """
     parts = []
     if len(papers) == 0 :
         return framework.replace('__CONTENT__', get_empty_html())
 
-    for p in papers:
-        # crop the abstract
-        '''
-        summary = p.summary
-        summary = summary[:min(600, len(summary))]
-        if len(summary) == 600:
-            summary += '...'
-        '''
+    for p in tqdm(papers,desc='Rendering Email'):
         rate = get_stars(p.score)
+        # Show at most five author names; ', ...' marks that more were omitted.
         authors = [a.name for a in p.authors[:5]]
         authors = ', '.join(authors)
         if len(p.authors) > 5:
             authors += ', ...'
-        parts.append(get_block_html(p.title, authors,rate,p.arxiv_id ,p.tldr, p.pdf_url, p.code_url))
+        # Same truncation rule for affiliations; a placeholder is used when
+        # the paper carries no affiliation list (p.affiliations is None).
+        if p.affiliations is not None:
+            affiliations = p.affiliations[:5]
+            affiliations = ', '.join(affiliations)
+            if len(p.affiliations) > 5:
+                affiliations += ', ...'
+        else:
+            affiliations = 'Unknown Affiliation'
+        parts.append(get_block_html(p.title, authors,rate,p.arxiv_id ,p.tldr, p.pdf_url, p.code_url, affiliations))
 
     content = '<br>' + '</br><br>'.join(parts) + '</br>'
     return framework.replace('__CONTENT__', content)
+
+def send_email(sender:str, receiver:str, password:str,smtp_server:str,smtp_port:int, html:str,):
+    def _format_addr(s):
+        name, addr = parseaddr(s)
+        return formataddr((Header(name, 'utf-8').encode(), addr))
+
+    msg = MIMEText(html, 'html', 'utf-8')
+    msg['From'] = _format_addr('Github Action <%s>' % sender)
+    msg['To'] = _format_addr('You <%s>' % receiver)
+    today = datetime.datetime.now().strftime('%Y/%m/%d')
+    msg['Subject'] = Header(f'Daily arXiv {today}', 'utf-8').encode()
+
+    try:
+        server = smtplib.SMTP(smtp_server, smtp_port)
+        server.starttls()
+    except smtplib.SMTPServerDisconnected:
+        server = smtplib.SMTP_SSL(smtp_server, smtp_port)
+
+    server.login(sender, password)
+    server.sendmail(sender, [receiver], msg.as_string())
+    server.quit()
diff --git a/llm.py b/llm.py
@@ -0,0 +1,37 @@
+from llama_cpp import Llama
+from openai import OpenAI
+from loguru import logger
+
+# Module-wide LLM instance; stays None until `set_global_llm` assigns it
+# (directly, or through the lazy default path in `get_llm`).
+GLOBAL_LLM = None
+
+class LLM:
+    def __init__(self, api_key: str = None, base_url: str = None, model: str = None):
+        if api_key:
+            self.llm = OpenAI(api_key=api_key, base_url=base_url)
+        else:
+            self.llm = Llama.from_pretrained(
+                repo_id="Qwen/Qwen2.5-3B-Instruct-GGUF",
+                filename="qwen2.5-3b-instruct-q4_k_m.gguf",
+                n_ctx=32_000,
+                n_threads=4,
+                verbose=False,
+            )
+        self.model = model
+
+    def generate(self, messages: list[dict]) -> str:
+        if isinstance(self.llm, OpenAI):
+            response = self.llm.chat.completions.create(messages=messages,temperature=0,model=self.model)
+            return response.choices[0].message.content
+        else:
+            response = self.llm.create_chat_completion(messages=messages,temperature=0)
+            return response["choices"][0]["message"]["content"]
+
+def set_global_llm(api_key: str = None, base_url: str = None, model: str = None):
+    global GLOBAL_LLM
+    GLOBAL_LLM = LLM(api_key=api_key, base_url=base_url, model=model)
+
+def get_llm() -> LLM:
+    if GLOBAL_LLM is None:
+        logger.info("No global LLM found, creating a default one. Use `set_global_llm` to set a custom one.")
+        set_global_llm()
+    return GLOBAL_LLM
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,7 @@ dist/ @@
     wheels/
     .vscode/
     *.egg-info
+    .env
     # Virtual environments
     .venv
@@ Expand Down @@