Skip to content

Commit

Permalink
Automate blog upload process (#333)
Browse files Browse the repository at this point in the history
  • Loading branch information
chirag-jn authored Apr 3, 2024
1 parent a160a5f commit cf5472e
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 0 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/upload-blogs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Publishes blog markdown to the Strapi CMS on every push to vh-redesign.
name: Upload blogs to Strapi

on:
  push:
    branches:
      - vh-redesign

jobs:
  run-script:
    # Guard so forks don't attempt to upload to Superlinked's Strapi instance.
    if: ${{ github.repository == 'superlinked/VectorHub' }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.x
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          pip install -r blog/requirements.txt
      - name: Run script
        # Fix: blog/main.py reads STRAPI_URL and STRAPI_API_KEY from the
        # environment; without these the script runs with empty credentials.
        env:
          STRAPI_URL: ${{ secrets.STRAPI_URL }}
          STRAPI_API_KEY: ${{ secrets.STRAPI_API_KEY }}
        run: python blog/main.py --directories blog/directories.json
8 changes: 8 additions & 0 deletions blog/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
__pycache__/
17 changes: 17 additions & 0 deletions blog/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Blog Uploads

## Setup
```bash
cd blog
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
```

## Run
```bash
export STRAPI_URL="https://mystrapi.strapiapp.com"
export STRAPI_API_KEY="mystrapi-api-key"
source blog/env/bin/activate
python blog/main.py --directories blog/directories.json
```
42 changes: 42 additions & 0 deletions blog/directories.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
[
{
"type": "folder",
"name": "VectorHub",
"path": "docs",
"has_blogs": false,
"children": [
{
"type": "folder",
"name": "Building Blocks",
"path": "building_blocks",
"has_blogs": false,
"children": [
{
"type": "folder",
"name": "Vector Compute",
"path": "vector_compute",
"has_blogs": true
},
{
"type": "folder",
"name": "Vector Search",
"path": "vector_search",
"has_blogs": true
},
{
"type": "folder",
"name": "Data Sources",
"path": "data_sources",
"has_blogs": true
}
]
},
{
"type": "folder",
"name": "Use Cases",
"path": "use_cases",
"has_blogs": true
}
]
}
]
88 changes: 88 additions & 0 deletions blog/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import os
import enum

GIT_REPO = 'https://github.com/superlinked/VectorHub'

class ItemType(enum.Enum):
    """Kinds of entries in the directories.json tree."""

    # String values match the "type" field used in directories.json.
    FOLDER = "folder"
    FILE = "file"

class Item:
    """A node in the directories.json tree describing where blogs live.

    Attributes:
        type: node kind ("folder"/"file" string from JSON, or an ItemType).
        name: human-readable display name.
        path: path segment relative to the parent node.
        has_blogs: True when this folder directly contains blog markdown.
        children: nested Item nodes.
    """

    def __init__(self, type, name, path, has_blogs=False, children=None):
        self.type = type
        self.name = name
        self.path = path
        self.has_blogs = has_blogs
        self.children = []

        if children:
            self.add_children(children)

    def __str__(self) -> str:
        return self.path

    def add_children(self, children):
        """Append child nodes, converting each dict into an Item subtree."""
        for child in children:
            # Recursively create Item objects for children
            self.children.append(Item.from_dict(child))

    @classmethod
    def from_dict(cls, data):
        """
        Create an Item instance (and its subtree) from a dictionary.
        """
        return cls(
            type=data.get("type", ItemType.FOLDER),
            name=data.get("name", ""),
            path=data.get("path", ""),
            has_blogs=data.get("has_blogs", False),
            children=data.get("children", [])
        )

    def to_dict(self):
        """
        Convert the Item object (and its subtree) back to a dictionary.

        Fix: children are now recursively converted to dicts as well, so the
        result round-trips through from_dict() and is JSON-serializable (the
        original kept the raw Item objects, which json.dumps rejects).
        """
        return {
            "type": self.type,
            "path": self.path,
            "name": self.name,
            "has_blogs": self.has_blogs,
            "children": [child.to_dict() for child in self.children],
        }


class StrapiBlog:
def __init__(self, content, filepath, last_updated):
self.content = content
self.filepath = filepath
self.last_updated = last_updated

def get_title(self) -> str:
return os.path.basename(self.filepath).replace('-', ' ').replace('_', ' ').replace('.md', '')

def __str__(self) -> str:
return self.get_title()

def get_github_url(self):
return f'{GIT_REPO}/blob/main/{self.filepath}'


def get_slug(self):
slug = self.filepath.replace('.md', '').replace('_', '-').replace(' ', '-').replace('docs/', '').replace('&', '').replace('--', '-')
return slug.lower()

def get_json(self):
return {
"github_url": self.get_github_url(),
"content": self.content,
"github_last_updated_date": self.last_updated,
"title": self.get_title(),
"slug_url": self.get_slug()
}

def get_post_json(self):
return {"data": self.get_json()}

def __eq__(self, __value) -> bool:
self.get_slug() == __value.get_slug()
173 changes: 173 additions & 0 deletions blog/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import os
import json
import argparse
from json.decoder import JSONDecodeError
import requests
from urllib.parse import urljoin
from helpers import Item, ItemType, StrapiBlog
from tqdm.auto import tqdm
from datetime import datetime
from pathlib import Path

# Populated by arg_parse(); holds the parsed CLI namespace.
args = None

# Strapi connection settings, supplied via environment variables (see README).
BASE_URL = os.getenv('STRAPI_URL', "")
API_KEY = os.getenv('STRAPI_API_KEY', "")

# Folder paths that directly contain blog markdown (filled by fetch_paths()).
paths_to_search = []
# slug -> {'discovered': bool, 'id': int}; seeded by load_existing_blogs(),
# updated by upload_blog(), consumed by delete_old_blogs().
existing_slugs_discovered = {}

# Shared headers for every Strapi API call.
headers = {
    'Authorization': f'Bearer {API_KEY}',
    'Content-Type': 'application/json'
}

def arg_parse():
    """Parse command-line options into the module-level ``args`` namespace."""
    global args
    cli = argparse.ArgumentParser(description="VectorHub Strapi Upload")
    cli.add_argument('--directories', help='Path to json which describes the directories to parse')
    args = cli.parse_args()

def load_items_from_json(directories: str) -> list:
    """Load the directory-tree description from the JSON file *directories*.

    Returns a list of root Item objects. Exits the process with status 1
    when the file is missing, is not valid JSON, or cannot be parsed into
    Items (matching the original's fail-fast behaviour for CI use).
    """
    # Guard clause: fail fast on a missing file instead of nesting the
    # happy path inside an if/else.
    if not os.path.exists(directories):
        print(f"{directories} does not exist.")
        exit(1)

    try:
        with open(directories, 'r') as file:
            data = json.load(file)
        # Item construction stays inside the try so malformed entries are
        # reported via the generic handler, as before.
        return [Item.from_dict(item_data) for item_data in data]
    except JSONDecodeError:
        print('JSON Structure is invalid.')
        exit(1)
    except Exception as e:
        # Typo fix: "occured" -> "occurred".
        print('Unknown error occurred.')
        print(e)
        exit(1)


def load_existing_blogs(page_num=1):
    """Page through the Strapi /api/blogs collection, recording every slug.

    Populates `existing_slugs_discovered` with
    ``slug -> {'discovered': False, 'id': <record id>}`` so upload_blog()
    can mark slugs still present in the repo and delete_old_blogs() can
    remove the rest. Recurses until an empty page is returned. A non-200
    response silently stops pagination (best-effort, as before).
    """
    base_url = urljoin(BASE_URL, 'api/blogs')
    search_url = base_url + f"?pagination[page]={page_num}"

    session = requests.Session()

    response = session.get(search_url, headers=headers)
    if response.status_code == 200:
        data = json.loads(response.text)['data']
        if len(data) > 0:
            for item in data:
                # Fix: keep the real Strapi record id (was hard-coded to -1,
                # which made delete_old_blogs() skip every stale entry).
                existing_slugs_discovered[item['attributes']['slug_url']] = {
                    'discovered': False,
                    'id': item['id'],
                }
            load_existing_blogs(page_num + 1)


def fetch_paths(node: Item, current_path=""):
    """Walk *node* depth-first, collecting paths of folders that hold blogs.

    Appends each blog-bearing folder's accumulated path to the module-level
    ``paths_to_search`` list.
    """
    # Extend the accumulated path with this node's own path segment.
    full_path = f"{current_path}/{node.path}" if current_path else node.path

    if node.has_blogs:
        paths_to_search.append(full_path)

    # Recurse into every child, carrying the accumulated path down.
    for child in (node.children or []):
        fetch_paths(child, full_path)


def find_files_to_upload(items: list):
    """Collect markdown files (excluding READMEs) under every blog folder.

    Walks each root Item to fill ``paths_to_search``, then globs ``*.md``
    in those folders. Returns a list of ``{'path': str, 'time': str}``
    dicts, where 'time' is the file's last-modification date (YYYY-MM-DD).
    """
    for root in items:
        fetch_paths(root)

    discovered = []
    for folder in paths_to_search:
        for md_file in Path(folder).glob("*.md"):
            # READMEs are navigation pages, not blog posts.
            if 'readme.md' in str(md_file).lower():
                continue
            mtime = os.path.getmtime(md_file)
            discovered.append({
                'path': str(md_file),
                'time': datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
            })

    return discovered


def build_blog_object(file_obj: dict) -> StrapiBlog:
    """Read the markdown file described by *file_obj* and wrap it in a StrapiBlog."""
    path = file_obj['path']
    with open(path, 'r') as handle:
        markdown = handle.read()
    return StrapiBlog(markdown, path, file_obj['time'])

def upload_blog(blog: StrapiBlog):
    """Create or update *blog* in Strapi, keyed by its slug.

    Looks the slug up via the filters API: if a record exists it is updated
    (PUT), otherwise a new one is created (POST). Marks the slug as
    discovered in `existing_slugs_discovered` so the later delete pass
    keeps it. Exits the process with status 1 when the create/update call
    fails; a failed slug lookup (non-200 GET) is silently ignored.
    """
    base_url = urljoin(BASE_URL, 'api/blogs')
    slug = blog.get_slug()
    search_url = base_url + f"?filters[slug_url][$eq]={slug}"
    session = requests.Session()

    if slug in existing_slugs_discovered:
        # Seen in the repo this run -> must not be deleted afterwards.
        existing_slugs_discovered[slug]['discovered'] = True

    response = session.get(search_url, headers=headers)

    if response.status_code == 200:
        responses = json.loads(response.text)['data']
        print(f'Uploading slug: {blog.get_slug()}')
        if len(responses) > 0:
            # Blog already exists at this slug -> update it in place.
            id = json.loads(response.text)['data'][0]['id']

            url = f"{base_url}/{id}"
            create_response = session.put(url, headers=headers, data=json.dumps(blog.get_post_json()))
        else:
            # It's a new blog -> create a fresh record.
            url = base_url
            create_response = session.post(url, headers=headers, data=json.dumps(blog.get_post_json()))

        if create_response.status_code == 200:
            # NOTE(review): assumes Strapi answers 200 for both PUT and POST;
            # confirm create does not return 201 on this Strapi version.
            if slug in existing_slugs_discovered:
                create_response_text = json.loads(create_response.text)
                existing_slugs_discovered[slug]['id'] = create_response_text['data']['id']
        else:
            print(f'Error in parsing blog: {slug}')
            print(create_response.text)
            exit(1)

def delete_old_blogs():
    """Delete Strapi blogs whose slugs were not re-discovered in this run.

    Iterates the slugs loaded by load_existing_blogs(); any entry never
    marked 'discovered' by upload_blog() no longer exists in the repo, so
    its Strapi record is removed. Entries without a known positive record
    id are skipped.
    """
    base_url = urljoin(BASE_URL, 'api/blogs')
    session = requests.Session()

    for slug, info in existing_slugs_discovered.items():
        if info['discovered']:
            continue
        print(f"Deleting slug: {slug}")
        if info['id'] > 0:
            # Fix: the original interpolated the builtin `id` function into
            # the URL (f"{base_url}/{id}") instead of the record's id.
            url = f"{base_url}/{info['id']}"
            session.delete(url, headers=headers)


if __name__ == "__main__":
    # Pipeline: parse CLI -> load tree -> snapshot remote slugs ->
    # upload every markdown file -> delete blogs removed from the repo.
    arg_parse()
    tree = load_items_from_json(args.directories)

    load_existing_blogs()

    blog_files = find_files_to_upload(tree)

    print('Uploading blogs')
    for blog_file in tqdm(blog_files):
        upload_blog(build_blog_object(blog_file))

    print('Deleting blogs')
    delete_old_blogs()
6 changes: 6 additions & 0 deletions blog/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
certifi==2024.2.2
charset-normalizer==3.3.2
idna==3.6
requests==2.31.0
tqdm==4.66.2
urllib3==2.2.1

0 comments on commit cf5472e

Please sign in to comment.