# urlscraper.py
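"""Scrape t.me channel links from a target Telegram channel using Telethon.

Collected links are deduplicated and written to URLs/<channel>.csv.
"""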
import os
import re
import asyncio

from telethon import TelegramClient
from telethon.tl.types import MessageEntityTextUrl
from colorama import Fore, Style

import details as ds

# Login details pulled from the local details.py module
api_id = ds.apiID
api_hash = ds.apiHash
phone = ds.number

async def main():
    # The phone number doubles as the Telethon session name
    client = TelegramClient(phone, api_id, api_hash)
    await client.start()

    # client.start() normally handles sign-in; this is a fallback for
    # sessions that are not yet authorized
    if not await client.is_user_authorized():
        await client.send_code_request(phone)
        await client.sign_in(phone, input('Enter the code: '))

    print(
        f'{Fore.CYAN}Please enter a target Telegram channel (e.g. https://t.me/{Fore.LIGHTYELLOW_EX}your_channel{Fore.CYAN}):{Style.RESET_ALL}\n')
    print()

    while True:
        try:
            channel_name = input("Please enter a Telegram channel name: ")
            print(f'You entered "{channel_name}"')
            answer = input('Is this correct? (y/n) ')
            if answer == 'y':
                print(f'Scraping URLs from {channel_name}...')
                break
        except Exception:
            continue

    urls = set()  # Use a set to deduplicate URLs

    async for message in client.iter_messages(channel_name):
        if message.entities is not None:
            # t.me links embedded in text entities (hyperlinked text)
            for entity in message.entities:
                if isinstance(entity, MessageEntityTextUrl):
                    url = entity.url
                    if 'https://t.me/' in url:
                        if match := re.match(r'https?://t\.me/([^/\s]+)/?', url):
                            channel_link = f'https://t.me/{match[1]}'
                            urls.add(channel_link)
                            print(f"URL - {Fore.CYAN}https://t.me/{match[1]}{Style.RESET_ALL}")
        elif message.text and isinstance(message.text, str):
            # Plain-text t.me links in the message body
            matches = re.findall(r'https?://t\.me/([^/\s]+)/?', message.text)
            for match in matches:
                channel_link = f'https://t.me/{match}'
                urls.add(channel_link)
                print(f"URL - {Fore.CYAN}https://t.me/{match}{Style.RESET_ALL}")

    urls_folder = 'URLs'
    os.makedirs(urls_folder, exist_ok=True)
    # Strip the URL scheme and slashes so the channel name is a safe file name
    safe_name = channel_name.replace('https://', '').replace('http://', '').replace('/', '_')
    output_filename = os.path.join(urls_folder, f'{safe_name}.csv')
    with open(output_filename, 'w', encoding='utf-8') as file:
        file.write('\n'.join(urls))
    print(f'URLs scraped successfully. Saved to: {output_filename}')


if __name__ == '__main__':
    asyncio.run(main())
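
# The script expects a local details.py module alongside this file. A minimal
# sketch of that module, assuming the attribute names used above (apiID,
# apiHash, number) and placeholder credentials obtained from https://my.telegram.org:
#
#     # details.py
#     apiID = 1234567                 # numeric API ID (placeholder)
#     apiHash = 'your_api_hash_here'  # API hash (placeholder)
#     number = '+15551234567'         # phone number of the Telegram account (placeholder)
#
# With details.py in place, the scraper can be run directly:
#
#     python urlscraper.py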