Skip to content

Commit

Permalink
Automate blog upload process (#333)
Browse files Browse the repository at this point in the history
  • Loading branch information
chirag-jn authored Apr 3, 2024
1 parent a160a5f commit cf5472e
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 0 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/upload-blogs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Publishes blog markdown to the Strapi CMS on every push to vh-redesign.
name: Upload blogs to Strapi

on:
  push:
    branches:
      - vh-redesign

jobs:
  run-script:
    # Guard so forks don't attempt to upload to Superlinked's Strapi instance.
    if: ${{ github.repository == 'superlinked/VectorHub' }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.x
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          pip install -r blog/requirements.txt
      - name: Run script
        # Fix: blog/main.py reads STRAPI_URL and STRAPI_API_KEY from the
        # environment; without these the script runs with empty credentials.
        env:
          STRAPI_URL: ${{ secrets.STRAPI_URL }}
          STRAPI_API_KEY: ${{ secrets.STRAPI_API_KEY }}
        run: python blog/main.py --directories blog/directories.json
8 changes: 8 additions & 0 deletions blog/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
__pycache__/
17 changes: 17 additions & 0 deletions blog/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Blog Uploads

## Setup
```bash
cd blog
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
```

## Run
```bash
export STRAPI_URL="https://mystrapi.strapiapp.com"
export STRAPI_API_KEY="mystrapi-api-key"
source blog/env/bin/activate
python blog/main.py --directories blog/directories.json
```
42 changes: 42 additions & 0 deletions blog/directories.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
[
{
"type": "folder",
"name": "VectorHub",
"path": "docs",
"has_blogs": false,
"children": [
{
"type": "folder",
"name": "Building Blocks",
"path": "building_blocks",
"has_blogs": false,
"children": [
{
"type": "folder",
"name": "Vector Compute",
"path": "vector_compute",
"has_blogs": true
},
{
"type": "folder",
"name": "Vector Search",
"path": "vector_search",
"has_blogs": true
},
{
"type": "folder",
"name": "Data Sources",
"path": "data_sources",
"has_blogs": true
}
]
},
{
"type": "folder",
"name": "Use Cases",
"path": "use_cases",
"has_blogs": true
}
]
}
]
88 changes: 88 additions & 0 deletions blog/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import os
import enum

GIT_REPO = 'https://github.com/superlinked/VectorHub'

class ItemType(enum.Enum):
    """Kinds of entries in the directories.json tree."""

    # String values match the "type" field used in directories.json.
    FOLDER = "folder"
    FILE = "file"

class Item:
    """A node in the directories.json tree describing where blogs live.

    Attributes:
        type: node kind ("folder"/"file" string from JSON, or an ItemType).
        name: human-readable display name.
        path: path segment relative to the parent node.
        has_blogs: True when this folder directly contains blog markdown.
        children: nested Item nodes.
    """

    def __init__(self, type, name, path, has_blogs=False, children=None):
        self.type = type
        self.name = name
        self.path = path
        self.has_blogs = has_blogs
        self.children = []

        if children:
            self.add_children(children)

    def __str__(self) -> str:
        return self.path

    def add_children(self, children):
        """Append child nodes, converting each dict into an Item subtree."""
        for child in children:
            # Recursively create Item objects for children
            self.children.append(Item.from_dict(child))

    @classmethod
    def from_dict(cls, data):
        """
        Create an Item instance (and its subtree) from a dictionary.
        """
        return cls(
            type=data.get("type", ItemType.FOLDER),
            name=data.get("name", ""),
            path=data.get("path", ""),
            has_blogs=data.get("has_blogs", False),
            children=data.get("children", [])
        )

    def to_dict(self):
        """
        Convert the Item object (and its subtree) back to a dictionary.

        Fix: children are now recursively converted to dicts as well, so the
        result round-trips through from_dict() and is JSON-serializable (the
        original kept the raw Item objects, which json.dumps rejects).
        """
        return {
            "type": self.type,
            "path": self.path,
            "name": self.name,
            "has_blogs": self.has_blogs,
            "children": [child.to_dict() for child in self.children],
        }


class StrapiBlog:
def __init__(self, content, filepath, last_updated):
self.content = content
self.filepath = filepath
self.last_updated = last_updated

def get_title(self) -> str:
return os.path.basename(self.filepath).replace('-', ' ').replace('_', ' ').replace('.md', '')

def __str__(self) -> str:
return self.get_title()

def get_github_url(self):
return f'{GIT_REPO}/blob/main/{self.filepath}'


def get_slug(self):
slug = self.filepath.replace('.md', '').replace('_', '-').replace(' ', '-').replace('docs/', '').replace('&', '').replace('--', '-')
return slug.lower()

def get_json(self):
return {
"github_url": self.get_github_url(),
"content": self.content,
"github_last_updated_date": self.last_updated,
"title": self.get_title(),
"slug_url": self.get_slug()
}

def get_post_json(self):
return {"data": self.get_json()}

def __eq__(self, __value) -> bool:
self.get_slug() == __value.get_slug()
173 changes: 173 additions & 0 deletions blog/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import os
import json
import argparse
from json.decoder import JSONDecodeError
import requests
from urllib.parse import urljoin
from helpers import Item, ItemType, StrapiBlog
from tqdm.auto import tqdm
from datetime import datetime
from pathlib import Path

# Populated by arg_parse(); holds the parsed CLI namespace.
args = None

# Strapi connection settings, supplied via environment variables (see README).
BASE_URL = os.getenv('STRAPI_URL', "")
API_KEY = os.getenv('STRAPI_API_KEY', "")

# Folder paths that directly contain blog markdown (filled by fetch_paths()).
paths_to_search = []
# slug -> {'discovered': bool, 'id': int}; seeded by load_existing_blogs(),
# updated by upload_blog(), consumed by delete_old_blogs().
existing_slugs_discovered = {}

# Shared headers for every Strapi API call.
headers = {
    'Authorization': f'Bearer {API_KEY}',
    'Content-Type': 'application/json'
}

def arg_parse():
    """Parse command-line options into the module-level ``args`` namespace."""
    global args
    cli = argparse.ArgumentParser(description="VectorHub Strapi Upload")
    cli.add_argument('--directories', help='Path to json which describes the directories to parse')
    args = cli.parse_args()

def load_items_from_json(directories: str) -> list:
    """Load the directory-tree description from the JSON file *directories*.

    Returns a list of root Item objects. Exits the process with status 1
    when the file is missing, is not valid JSON, or cannot be parsed into
    Items (matching the original's fail-fast behaviour for CI use).
    """
    # Guard clause: fail fast on a missing file instead of nesting the
    # happy path inside an if/else.
    if not os.path.exists(directories):
        print(f"{directories} does not exist.")
        exit(1)

    try:
        with open(directories, 'r') as file:
            data = json.load(file)
        # Item construction stays inside the try so malformed entries are
        # reported via the generic handler, as before.
        return [Item.from_dict(item_data) for item_data in data]
    except JSONDecodeError:
        print('JSON Structure is invalid.')
        exit(1)
    except Exception as e:
        # Typo fix: "occured" -> "occurred".
        print('Unknown error occurred.')
        print(e)
        exit(1)


def load_existing_blogs(page_num=1):
    """Page through the Strapi /api/blogs collection, recording every slug.

    Populates `existing_slugs_discovered` with
    ``slug -> {'discovered': False, 'id': <record id>}`` so upload_blog()
    can mark slugs still present in the repo and delete_old_blogs() can
    remove the rest. Recurses until an empty page is returned. A non-200
    response silently stops pagination (best-effort, as before).
    """
    base_url = urljoin(BASE_URL, 'api/blogs')
    search_url = base_url + f"?pagination[page]={page_num}"

    session = requests.Session()

    response = session.get(search_url, headers=headers)
    if response.status_code == 200:
        data = json.loads(response.text)['data']
        if len(data) > 0:
            for item in data:
                # Fix: keep the real Strapi record id (was hard-coded to -1,
                # which made delete_old_blogs() skip every stale entry).
                existing_slugs_discovered[item['attributes']['slug_url']] = {
                    'discovered': False,
                    'id': item['id'],
                }
            load_existing_blogs(page_num + 1)


def fetch_paths(node: Item, current_path=""):
    """Walk *node* depth-first, collecting paths of folders that hold blogs.

    Appends each blog-bearing folder's accumulated path to the module-level
    ``paths_to_search`` list.
    """
    # Extend the accumulated path with this node's own path segment.
    full_path = f"{current_path}/{node.path}" if current_path else node.path

    if node.has_blogs:
        paths_to_search.append(full_path)

    # Recurse into every child, carrying the accumulated path down.
    for child in (node.children or []):
        fetch_paths(child, full_path)


def find_files_to_upload(items: list):
    """Collect markdown files (excluding READMEs) under every blog folder.

    Walks each root Item to fill ``paths_to_search``, then globs ``*.md``
    in those folders. Returns a list of ``{'path': str, 'time': str}``
    dicts, where 'time' is the file's last-modification date (YYYY-MM-DD).
    """
    for root in items:
        fetch_paths(root)

    discovered = []
    for folder in paths_to_search:
        for md_file in Path(folder).glob("*.md"):
            # READMEs are navigation pages, not blog posts.
            if 'readme.md' in str(md_file).lower():
                continue
            mtime = os.path.getmtime(md_file)
            discovered.append({
                'path': str(md_file),
                'time': datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
            })

    return discovered


def build_blog_object(file_obj: dict) -> StrapiBlog:
    """Read the markdown file described by *file_obj* and wrap it in a StrapiBlog."""
    path = file_obj['path']
    with open(path, 'r') as handle:
        markdown = handle.read()
    return StrapiBlog(markdown, path, file_obj['time'])

def upload_blog(blog: StrapiBlog):
    """Create or update *blog* in Strapi, keyed by its slug.

    Looks the slug up via the filters API: if a record exists it is updated
    (PUT), otherwise a new one is created (POST). Marks the slug as
    discovered in `existing_slugs_discovered` so the later delete pass
    keeps it. Exits the process with status 1 when the create/update call
    fails; a failed slug lookup (non-200 GET) is silently ignored.
    """
    base_url = urljoin(BASE_URL, 'api/blogs')
    slug = blog.get_slug()
    search_url = base_url + f"?filters[slug_url][$eq]={slug}"
    session = requests.Session()

    if slug in existing_slugs_discovered:
        # Seen in the repo this run -> must not be deleted afterwards.
        existing_slugs_discovered[slug]['discovered'] = True

    response = session.get(search_url, headers=headers)

    if response.status_code == 200:
        responses = json.loads(response.text)['data']
        print(f'Uploading slug: {blog.get_slug()}')
        if len(responses) > 0:
            # Blog already exists at this slug -> update it in place.
            id = json.loads(response.text)['data'][0]['id']

            url = f"{base_url}/{id}"
            create_response = session.put(url, headers=headers, data=json.dumps(blog.get_post_json()))
        else:
            # It's a new blog -> create a fresh record.
            url = base_url
            create_response = session.post(url, headers=headers, data=json.dumps(blog.get_post_json()))

        if create_response.status_code == 200:
            # NOTE(review): assumes Strapi answers 200 for both PUT and POST;
            # confirm create does not return 201 on this Strapi version.
            if slug in existing_slugs_discovered:
                create_response_text = json.loads(create_response.text)
                existing_slugs_discovered[slug]['id'] = create_response_text['data']['id']
        else:
            print(f'Error in parsing blog: {slug}')
            print(create_response.text)
            exit(1)

def delete_old_blogs():
    """Delete Strapi blogs whose slugs were not re-discovered in this run.

    Iterates the slugs loaded by load_existing_blogs(); any entry never
    marked 'discovered' by upload_blog() no longer exists in the repo, so
    its Strapi record is removed. Entries without a known positive record
    id are skipped.
    """
    base_url = urljoin(BASE_URL, 'api/blogs')
    session = requests.Session()

    for slug, info in existing_slugs_discovered.items():
        if info['discovered']:
            continue
        print(f"Deleting slug: {slug}")
        if info['id'] > 0:
            # Fix: the original interpolated the builtin `id` function into
            # the URL (f"{base_url}/{id}") instead of the record's id.
            url = f"{base_url}/{info['id']}"
            session.delete(url, headers=headers)


if __name__ == "__main__":
    # Pipeline: parse CLI -> load tree -> snapshot remote slugs ->
    # upload every markdown file -> delete blogs removed from the repo.
    arg_parse()
    tree = load_items_from_json(args.directories)

    load_existing_blogs()

    blog_files = find_files_to_upload(tree)

    print('Uploading blogs')
    for blog_file in tqdm(blog_files):
        upload_blog(build_blog_object(blog_file))

    print('Deleting blogs')
    delete_old_blogs()
6 changes: 6 additions & 0 deletions blog/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
certifi==2024.2.2
charset-normalizer==3.3.2
idna==3.6
requests==2.31.0
tqdm==4.66.2
urllib3==2.2.1

0 comments on commit cf5472e

Please sign in to comment.