-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #32 from ChenghaoMou/main
Tracking Dark Visitors Automatically
- Loading branch information
Showing 6 changed files with 169 additions and 41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Scheduled workflow: runs code/dark_visitors.py, commits and pushes whatever
# the script changed, then calls the reusable main workflow with a fixed
# commit message.
name: Daily Update from Dark Visitors
on:
  schedule:
    # Once a day at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: "0 0 * * *"

jobs:
  dark-visitors:
    runs-on: ubuntu-latest
    name: dark-visitors
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
      # Install the scraper's dependencies, run it, and push only when the
      # working tree or the index actually differs from HEAD: the
      # `git diff --quiet && git diff --staged --quiet` pair succeeds when
      # nothing changed, which short-circuits the commit/push group.
      # NOTE(review): the user.email value below looks like it was redacted
      # by the page export; confirm the real address before using this file.
      - run: |
          pip install beautifulsoup4 requests
          git config --global user.name "dark-visitors"
          git config --global user.email "[email protected]"
          python code/dark_visitors.py
          git add -A
          git diff --quiet && git diff --staged --quiet || (git commit -m "Daily update from Dark Visitors" && git push)
        shell: bash
  # Runs after the update job and hands the commit message to main.yml
  # through its `message` workflow_call input.
  call-main:
    needs: dark-visitors
    uses: ./.github/workflows/main.yml
    secrets: inherit
    with:
      message: "Daily update from Dark Visitors"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
on: | ||
workflow_call: | ||
inputs: | ||
message: | ||
type: string | ||
required: true | ||
description: The message to commit | ||
push: | ||
paths: | ||
- 'robots.json' | ||
|
@@ -23,6 +29,10 @@ jobs: | |
git config --global user.name "ai.robots.txt" | ||
git config --global user.email "[email protected]" | ||
git add -A | ||
git commit -m "${{ github.event.head_commit.message }}" | ||
if [ -n "${{ inputs.message }}" ]; then | ||
git commit -m "${{ inputs.message }}" | ||
else | ||
git commit -m "${{ github.event.head_commit.message }}" | ||
fi | ||
git push | ||
shell: bash |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
"""Merge agents listed on the Dark Visitors site into ``robots.json``.

The script downloads the agent directory page, walks every section whose
heading is in ``to_include``, and creates or refreshes one ``robots.json``
entry per agent.  Values already present in ``robots.json`` win over scraped
ones unless the stored value is one of the known placeholders in
``default_values``.
"""
import json
from pathlib import Path

import requests
from bs4 import BeautifulSoup

AGENTS_URL = "https://darkvisitors.com/agents"
ROBOTS_PATH = Path("./robots.json")
# Seconds to wait for the site before giving up; without a timeout a stalled
# connection would block the scheduled job indefinitely.
REQUEST_TIMEOUT = 30

# Placeholder used for every field the page does not provide.
default_value = "Unclear at this time."
# Stored values treated as "no real information"; they may be overwritten by
# a scraped value that is not itself a placeholder.
default_values = {
    "Unclear at this time.",
    "No information. provided.",
    "No information.",
    "No explicit frequency provided.",
}

# Section headings whose agents are merged; the commented-out headings are
# deliberately left out.
to_include = [
    "AI Assistants",
    "AI Data Scrapers",
    "AI Search Crawlers",
    # "Archivers",
    # "Developer Helpers",
    # "Fetchers",
    # "Intelligence Gatherers",
    # "Scrapers",
    # "Search Engine Crawlers",
    # "SEO Crawlers",
    # "Uncategorized",
    "Undocumented AI Agents",
]


def parse_operator(desc: str) -> str:
    """Return the text between "operated by " and the next period.

    Falls back to ``default_value`` when the description does not contain
    the phrase "operated by ".
    """
    _, marker, tail = desc.partition("operated by ")
    if not marker:
        return default_value
    return tail.partition(".")[0].strip()


def consolidate(existing: dict, name: str, field: str, value: str) -> str:
    """Choose between the stored and the freshly scraped value of a field.

    Args:
        existing: The parsed ``robots.json`` mapping.
        name: Agent name, i.e. the key into ``existing``.
        field: Field name inside the agent entry.
        value: Candidate value derived from the scraped page.

    Returns:
        ``value`` when the agent or the field is new, or when the stored
        value is a placeholder and ``value`` is not; otherwise the stored
        value.
    """
    entry = existing.get(name)
    if entry is None or field not in entry:
        return value
    current = entry[field]
    if current in default_values and value not in default_values:
        return value
    return current


session = requests.Session()
response = session.get(AGENTS_URL, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# contain no agent sections and make the run look like a successful no-op.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

existing_content = json.loads(ROBOTS_PATH.read_text(encoding="utf-8"))

for section in soup.find_all("div", {"class": "agent-links-section"}):
    heading = section.find("h2")
    if heading is None:
        continue
    category = heading.get_text().strip()
    if category not in to_include:
        continue

    for agent in section.find_all("a", href=True):
        name_tag = agent.find("div", {"class": "agent-name"})
        desc_tag = agent.find("p")
        if name_tag is None or desc_tag is None:
            # A link without the expected markup is skipped rather than
            # aborting the whole run with an AttributeError.
            print(f"Skipping malformed entry in {category!r}: {agent['href']}")
            continue

        name = name_tag.get_text().strip()
        desc = desc_tag.get_text().strip()
        operator = parse_operator(desc)

        # NOTE(review): if the scraped href already begins with "/agents",
        # the link below ends up with a doubled "/agents" segment -- confirm
        # the href format on the live page before changing this string.
        description = (
            f"{desc} More info can be found at "
            f"https://darkvisitors.com/agents{agent['href']}"
        )

        # All fields are resolved against the stored entry before that entry
        # is replaced.
        existing_content[name] = {
            "operator": consolidate(existing_content, name, "operator", operator),
            "respect": consolidate(existing_content, name, "respect", default_value),
            "function": consolidate(existing_content, name, "function", category),
            "frequency": consolidate(existing_content, name, "frequency", default_value),
            "description": consolidate(existing_content, name, "description", description),
        }

print(f"Total: {len(existing_content)}")
ROBOTS_PATH.write_text(json.dumps(existing_content, indent=4), encoding="utf-8")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.