Skip to content

Commit

Permalink
Merge pull request #2 from RyouMon/dev
Browse files Browse the repository at this point in the history
bump version to v0.0.4
  • Loading branch information
RyouMon authored Feb 19, 2022
2 parents 801d70d + f253204 commit 1007272
Show file tree
Hide file tree
Showing 10 changed files with 138 additions and 10 deletions.
27 changes: 24 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@ Crawl your personal favorite images, photo albums, comics from website.
Plan to support:
- pixiv.net (crawl your bookmarks for illust)
- yande.re (crawl posts that you voted)
- immpic.com
- lmmpic.com (crawl your favorites for albums)
- instagram.com
- nhentai.net

Already support:
- pixiv.net (must login), thanks to the [PixivPy](https://github.com/upbit/pixivpy) project.
- yande.re (must login, only input your username)
- lmmpic.com (must login)

# Requirements
- Python3.6+
Expand All @@ -32,6 +33,11 @@ export https_proxy=http://localhost:8080
```

# Login

```
favors login [-h] {pixiv,yandere,lemon}
```

## Login Pixiv
Thanks for [@ZipFile Pixiv OAuth Flow](https://gist.github.com/ZipFile/c9ebedb224406f4f11845ab700124362)
1. run command
Expand All @@ -54,20 +60,35 @@ Thanks for [@ZipFile Pixiv OAuth Flow](https://gist.github.com/ZipFile/c9ebedb22
```
2. input your username and hit the Enter key.
## Login Lmmpic
1. run command:
```
favors login lemon
```
2. input your username and hit the Enter key.
3. input your password and hit the Enter key.
# Crawl
## Crawl Pixiv
Before run this command, make sure you are already run `favors login pixiv`.
Before running this command, make sure you are already [logged in](#login-pixiv).
```
favors crawl pixiv
```
## Crawl Yandere
Before run this command, make sure you are already run `favors login yandere`.
Before running this command, make sure you are already [logged in](#login-yandere).
```
favors crawl yandere
```
## Crawl Lmmpic
Before running this command, make sure you are already [logged in](#login-lmmpic).
```
favors crawl lemon
```
## Crawl All Support Site
```
favors crawl all
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = favorites_crawler
version = 0.0.3
version = 0.0.4
author = RyouMon
author_email = [email protected]
description = Crawl your personal favorite images, photo albums, comics from website. Support pixiv, yande.re for now.
Expand Down
1 change: 1 addition & 0 deletions src/favorites_crawler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# Maps each CLI site keyword to the interactive login routine that collects
# and stores that site's credentials (routines live in utils/auth.py).
login_processors = {
    'pixiv': auth.login_pixiv,
    'yandere': auth.auth_yandere,
    'lemon': auth.auth_lmmpic,  # lmmpic.com username/password login
}


Expand Down
3 changes: 3 additions & 0 deletions src/favorites_crawler/constants/endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
# OAuth token endpoint used by the Pixiv login flow.
PIXIV_AUTH_TOKEN_URL = "https://oauth.secure.pixiv.net/auth/token"

# JSON listing of yande.re posts.
YANDERE_POST_URL = 'https://yande.re/post.json'

# lmmpic.com: WordPress login form, and the page a logged-in user lands on.
LEMON_PIC_LOGIN_URL = 'https://www.lmmpic.com/wp-login.php'
LEMON_PIC_USER_FAVORITES_URL = 'https://www.lmmpic.com/user-center'
8 changes: 8 additions & 0 deletions src/favorites_crawler/itemloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,11 @@ class YanderePostItemLoader(ItemLoader):
"""Yandere Post Loader"""
default_item_class = items.YanderePostItem
default_output_processor = pc.take_first


class LemonPicPostItemLoader(ItemLoader):
    """Lemon Pic Post Loader"""
    default_item_class = items.LemonPicPostItem
    default_output_processor = pc.take_first

    # image_urls and tags are genuine lists: keep every extracted value
    # instead of collapsing them to the first one.
    image_urls_out = pc.identity
    tags_out = pc.identity
23 changes: 18 additions & 5 deletions src/favorites_crawler/items.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import os.path
from typing import List
from urllib.parse import unquote
from dataclasses import dataclass, field

Expand Down Expand Up @@ -39,3 +36,19 @@ def get_filename(self):
filename = unquote(filename)
filename = drop_illegal_characters(filename)
return filename


@dataclass
class LemonPicPostItem:
    """One lmmpic.com album post: its image URLs plus naming metadata."""
    id: int = field(default=None)
    title: str = field(default=None)
    image_urls: List = field(default=None)
    tags: List = field(default=None)
    referer: str = field(default=None)  # album page URL, sent as the Referer header

    def get_filename(self, url):
        """Return the relative path '<title> [<tags>]/<basename>' for one image URL.

        :param url: direct URL of a single image belonging to this post.
        :return: path with characters illegal in file names stripped.
        """
        # `tags` defaults to None; treat a missing tag list as empty instead
        # of crashing in ' '.join.
        tags = ' '.join(self.tags or ())
        folder = f'{self.title} [{tags}]'
        # [-1] instead of [1] so a URL without any '/' still yields a name
        # rather than raising IndexError.
        name = url.rsplit('/', maxsplit=1)[-1]
        filename = os.path.join(folder, name)
        return drop_illegal_characters(filename)
11 changes: 11 additions & 0 deletions src/favorites_crawler/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,14 @@ def get_media_requests(self, item, info):

def file_path(self, request, response=None, info=None, *, item=None):
return os.path.join('Yandere', item.get_filename())


class CollectionFilePipeline(FilesPipeline):
    """Download an item's image_urls, sending the item's referer header."""

    def get_media_requests(self, item, info):
        # Adapt the item to a plain dict so dataclass items work too.
        adapted = ItemAdapter(item).asdict()
        headers = {'referer': adapted.get('referer')}
        for image_url in adapted.get('image_urls', ()):
            yield Request(image_url, headers=headers)

    def file_path(self, request, response=None, info=None, *, item=None):
        # Delegate naming to the item, which knows its folder/tag layout.
        return item.get_filename(request.url)
56 changes: 56 additions & 0 deletions src/favorites_crawler/spiders/lemon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from scrapy.http import FormRequest, Request
from scrapy.spiders.crawl import CrawlSpider, Rule, LinkExtractor
from scrapy.exceptions import CloseSpider

from favorites_crawler.itemloaders import LemonPicPostItemLoader
from favorites_crawler.constants.endpoints import LEMON_PIC_LOGIN_URL, LEMON_PIC_USER_FAVORITES_URL
from favorites_crawler.utils.config import load_config


class LemonSpider(CrawlSpider):
    """Crawl the albums a logged-in user marked as favorite on lmmpic.com."""
    name = 'lemon'
    allowed_domains = ['lmmpic.com']
    rules = [
        # Album links on the favorites page; follow into each album.
        Rule(
            LinkExtractor(restrict_xpaths='//div[@class="my-favorite"]', allow='.+html', deny='#'),
            callback='parse_item', follow=True,
        ),
        # Pagination links inside a multi-page album.
        Rule(
            LinkExtractor(restrict_xpaths='//div[@class="page-links"]', allow='.+html/.+'),
            callback='parse_item',
        ),
    ]
    custom_settings = {
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.CollectionFilePipeline': 0},
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Credentials are stored by `favors login lemon` (utils/auth.py).
        config = load_config().get('lmmpic', {})
        self.username = config.get('username', '')
        self.password = config.get('password', '')

    def start_requests(self):
        """Start by fetching the login page so its form can be submitted."""
        yield Request(url=LEMON_PIC_LOGIN_URL, callback=self.login)

    def login(self, response):
        """Submit the WordPress login form with the stored credentials."""
        yield FormRequest.from_response(
            formdata={'log': self.username, 'pwd': self.password},
            response=response, callback=self.after_login,
        )

    def after_login(self, response):
        """Abort on failed login, otherwise kick off rule-based crawling.

        A successful login lands on the user-center page; any other URL is
        treated as a failed login.
        """
        if response.url != LEMON_PIC_USER_FAVORITES_URL:
            # logger.warn() is a deprecated alias of warning().
            self.logger.warning('Your username or password may not be valid.')
            raise CloseSpider('not login')

        # This response did not come through CrawlSpider's default parse(),
        # so apply the link-extraction rules to it manually.
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item

    def parse_item(self, response, **kwargs):
        """Extract one album page into a LemonPicPostItem."""
        loader = LemonPicPostItemLoader(selector=response)
        loader.add_xpath('title', '//h1[@class="entry-title"]/text()')
        loader.add_xpath('image_urls', '//div[@class="single-content"]//img/@src')
        loader.add_xpath('tags', '//a[@rel="tag"]/text()')
        loader.add_value('referer', response.url)
        yield loader.load_item()
15 changes: 15 additions & 0 deletions src/favorites_crawler/utils/auth.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from getpass import getpass
from base64 import urlsafe_b64encode
from hashlib import sha256
from secrets import token_urlsafe
Expand Down Expand Up @@ -108,3 +109,17 @@ def auth_yandere():
yandere_config['username'] = username
dump_config(config)
return yandere_config


def auth_lmmpic():
    """Prompt for lmmpic.com credentials and persist them in the config file.

    Returns the updated 'lmmpic' config section, or None if the user
    cancelled the prompt (Ctrl-C / EOF).
    """
    config = load_config()
    try:
        user = input("username: ").strip()
        secret = getpass("password: ")
    except (EOFError, KeyboardInterrupt):
        return None
    section = config.setdefault('lmmpic', {})
    section.update({'username': user, 'password': secret})
    dump_config(config)
    return section
2 changes: 1 addition & 1 deletion src/favorites_crawler/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ def load_config():
def dump_config(data):
"""Dump config data to user home"""
with open(config_file, 'w', encoding='utf8') as f:
yaml.safe_dump(data, f)
yaml.safe_dump(data, f, allow_unicode=True)

0 comments on commit 1007272

Please sign in to comment.