Merge pull request #49 from RyouMon/feature-image-metadata
Store tags (IPTC/Keywords) in pixiv and yandere image files.
RyouMon authored Jan 7, 2024
2 parents d93b88c + ae3f7f2 commit 5e77115
Showing 19 changed files with 171 additions and 74 deletions.
17 changes: 11 additions & 6 deletions README.md
@@ -133,20 +133,25 @@ yandere:
## Organize files by artist
If you want to organize pixiv illusts by user, add this line to your config:
```yaml
...
pixiv:
  # FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER: true  # (deprecated)
  ENABLE_ORGANIZE_BY_ARTIST: true  # add this line to your pixiv config
...
```
If you want to organize yandere posts by artist, add this line to your config:
```yaml
...
yandere:
  ENABLE_ORGANIZE_BY_ARTIST: true  # add this line to your yandere config
...
```
## Store tags in IPTC/Keywords
Only pixiv and yandere are supported.
```yaml
yandere:
  ENABLE_WRITE_IPTC_KEYWORDS: true  # default: true
  EXIF_TOOL_EXECUTABLE: '<Path to your exiftool executable>'  # default: None
pixiv:
  ENABLE_WRITE_IPTC_KEYWORDS: true  # default: true
  EXIF_TOOL_EXECUTABLE: '<Path to your exiftool executable>'  # default: None
```
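When enabled, the collected tags are written into each image's IPTC Keywords through pyexiftool (see `PicturePipeline` in this commit). A minimal sketch of the write performed per file (the path and tag values here are hypothetical):
```python
from exiftool import ExifToolHelper

# Sketch of the keyword write done by PicturePipeline (hypothetical path/tags).
with ExifToolHelper() as et:
    msg = et.set_tags(
        'downloads/example.jpg',                # file just stored by the pipeline
        {'Keywords': ['landscape', 'scenic']},  # tags collected by the spider
        ['-overwrite_original'],                # don't keep a *_original backup copy
    )
    print(msg.rstrip())  # exiftool reports e.g. '1 image files updated'
```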
# Restore your favorites
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ requests==2.27.1
pyyaml==6.0
itemadapter==0.4.0
itemloaders==1.0.4
Twisted==22.10.0
Twisted==22.10.0
pyexiftool==0.5.6
2 changes: 1 addition & 1 deletion src/favorites_crawler/__init__.py
@@ -35,7 +35,7 @@ def crawl(name, **kwargs):

def spider_closed(spider):
    stats = spider.crawler.stats.get_stats()
    if not (stats.get('item_scrapped_count', 0) + stats.get('item_dropped_count', 0)):
    if not (stats.get('item_scraped_count', 0) + stats.get('item_dropped_count', 0)):
        logger.warning('Your cookies or token may have expired.')


2 changes: 1 addition & 1 deletion src/favorites_crawler/constants/headers.py
@@ -4,5 +4,5 @@
PIXIV_REQUEST_HEADERS = {
    'APP-OS': 'ios',
    'APP-OS-Version': '12.6',
    'Accept-Language': 'zh-CH;zh;q=0.9;en;q=0.8',
    'Accept-Language': 'en',
}
8 changes: 5 additions & 3 deletions src/favorites_crawler/itemloaders.py
@@ -1,11 +1,11 @@
from itemloaders import ItemLoader
from itemloaders.processors import Join, Compose, MapCompose
from itemloaders.processors import Compose, MapCompose

from favorites_crawler import items
from favorites_crawler.processors import take_first, identity, get_nhentai_id, wrap_credits, \
    original_url_from_nhentai_thumb_url, select_best_nhentai_title, clean_nhentai_title, \
    get_year_from_iso_format, get_month_from_iso_format, get_series_from_title, get_volume_from_title, \
    clean_parodies, get_page
    clean_parodies, get_lemon_page, get_pixiv_tags, get_yandere_tags


class PixivIllustItemLoader(ItemLoader):
@@ -15,6 +15,7 @@ class PixivIllustItemLoader(ItemLoader):

    file_urls_out = identity
    user_id_out = Compose(take_first, str)
    tags_out = get_pixiv_tags


class YanderePostItemLoader(ItemLoader):
@@ -24,6 +25,7 @@ class YanderePostItemLoader(ItemLoader):

    file_urls_out = identity
    artist_out = Compose(take_first, lambda s: s.strip())
    tags_out = Compose(take_first, get_yandere_tags)


class NHentaiGalleryItemLoader(ItemLoader):
@@ -51,4 +53,4 @@ class LemonPicPostItemLoader(ItemLoader):

    file_urls_out = identity
    tags_out = identity
    page_out = Compose(take_first, get_page)
    page_out = Compose(take_first, get_lemon_page)
3 changes: 0 additions & 3 deletions src/favorites_crawler/items.py
@@ -34,7 +34,6 @@ def get_folder_name(self, spider):

@dataclass
class ComicBookInfoItem:

    title: str = field(default=None, metadata={'is_comic_info': True})
    series: str = field(default=None, metadata={'is_comic_info': True})
    publisher: str = field(default=None, metadata={'is_comic_info': True})
@@ -71,7 +70,6 @@ def get_comic_info(self):

@dataclass
class PixivIllustItem(BaseItem):

    user_id: str = field(default=None)

    def get_folder_name(self, spider):
@@ -83,7 +81,6 @@ def get_folder_name(self, spider):

@dataclass
class YanderePostItem(BaseItem):

    artist: str = field(default=None)

    def get_folder_name(self, spider):
80 changes: 65 additions & 15 deletions src/favorites_crawler/pipelines.py
@@ -11,15 +11,79 @@
from scrapy.pipelines.files import FilesPipeline
from itemadapter import ItemAdapter
from twisted.python.failure import Failure
from exiftool import ExifToolHelper

from favorites_crawler.utils.files import create_comic_archive


logger = logging.getLogger(__name__)


class FavoritesFilePipeline(FilesPipeline):
class BasePipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        item_dict = ItemAdapter(item).asdict()
        referer = item_dict.get('referer')
        return (Request(url, headers={'referer': referer}) for url in item_dict.get(self.files_urls_field, ()))

    def file_path(self, request, response=None, info=None, *, item=None):
        return item.get_filepath(request.url, info.spider)

    def item_completed(self, results, item, info):
        for result in info.downloaded.values():
            if isinstance(result, Failure):
                logger.error('Error when downloading file: %s', result.value)
        return super().item_completed(results, item, info)


class PicturePipeline(BasePipeline):
    """Save image and add iptc/keywords to it."""
    def __init__(self, store_uri, download_func=None, settings=None):
        super().__init__(store_uri, download_func=download_func, settings=settings)
        self.write_iptc_keywords = settings.getbool('ENABLE_WRITE_IPTC_KEYWORDS', False)
        if self.write_iptc_keywords:
            try:
                self.exif_tool = ExifToolHelper(executable=settings.get('EXIF_TOOL_EXECUTABLE', None))
                self.exif_tool.run()
            except Exception as e:
                logger.error('Failed to load exiftool, consider installing it or setting EXIF_TOOL_EXECUTABLE. '
                             '\nException: %r', e)
                self.exif_tool = None
        else:
            self.exif_tool = None

    def close_spider(self, _):
        if self.exif_tool and self.exif_tool.running:
            self.exif_tool.terminate()

    def item_completed(self, results, item, info):
        item = super().item_completed(results, item, info)
        if not self.exif_tool:
            return item

        for success, result in results:
            if not (success and item.tags):
                continue
            path = item.get_filepath(result['url'], info.spider)
            try:
                msg = self.exif_tool.set_tags(
                    Path(self.store.basedir) / path,
                    {'Keywords': item.tags},
                    ['-overwrite_original'],
                ).rstrip()
            except Exception as e:
                logger.error('Failed to write tags: %r to "%s", result: %r', item.tags, path, e)
            else:
                if msg == '1 image files updated':
                    info.spider.crawler.stats.inc_value('iptc_status_count/updated')
                    logger.debug('Successfully wrote tags: %r to "%s", result: %s', item.tags, path, msg)
                else:
                    logger.error('Failed to write tags: %r to "%s", result: %s', item.tags, path, msg)

        return item


class ComicPipeline(BasePipeline):
    """Archive comic as cbz and add ComicBookInfo to it."""
    def __init__(self, store_uri, **kwargs):
        super().__init__(store_uri, **kwargs)
        self.files_path = Path(store_uri).resolve()
@@ -44,17 +108,3 @@ def process_item(self, item, spider):
        self.comic_comments[title] = bytes(comment, encoding='utf-8')

        return super().process_item(item, spider)

    def get_media_requests(self, item, info):
        item_dict = ItemAdapter(item).asdict()
        referer = item_dict.get('referer')
        return (Request(url, headers={'referer': referer}) for url in item_dict.get(self.files_urls_field, ()))

    def file_path(self, request, response=None, info=None, *, item=None):
        return item.get_filepath(request.url, info.spider)

    def item_completed(self, results, item, info):
        for result in info.downloaded.values():
            if isinstance(result, Failure):
                logger.error('Error when downloading file: %s', result.value)
        return super().item_completed(results, item, info)
20 changes: 19 additions & 1 deletion src/favorites_crawler/processors.py
@@ -120,8 +120,26 @@ def clean_parodies(parodies):
    return parodies.strip()


def get_page(url):
def get_lemon_page(url):
    match = re.match(r'https://www\..+html/(\d+)', url)
    if not match:
        return 1
    return int(match.group(1))


def get_pixiv_tags(tags):
    """Return en-us tags."""
    results = set()
    for tag in tags:
        if tag.get('name'):
            results.add(tag['name'].strip().replace(' ', '_').lower())
        if tag.get('translated_name'):
            results.add(tag['translated_name'].strip().replace(' ', '_').lower())
    return list(filter(
        lambda x: re.match(r'^[ -~]+$', x),  # ascii only
        results,
    ))


def get_yandere_tags(tags):
    return tags.split(' ')
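For reference, the two new processors behave like this on sample inputs (values made up for illustration):
```python
# Hypothetical inputs illustrating the new tag processors:
print(get_pixiv_tags([{'name': '東方', 'translated_name': 'Touhou Project'}]))
# ['touhou_project']  (non-ASCII names are filtered out, spaces become '_')
print(get_yandere_tags('landscape scenic'))
# ['landscape', 'scenic']
```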
6 changes: 4 additions & 2 deletions src/favorites_crawler/settings.py
@@ -27,6 +27,8 @@

TELNETCONSOLE_ENABLED = False

ITEM_PIPELINES = {'favorites_crawler.pipelines.FavoritesFilePipeline': 0}

FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER = False

# ExifTool settings
ENABLE_WRITE_IPTC_KEYWORDS = True
EXIF_TOOL_EXECUTABLE = None
3 changes: 3 additions & 0 deletions src/favorites_crawler/spiders/lemon.py
@@ -21,6 +21,9 @@ class LemonSpider(BaseSpider):
            callback='parse',
        ),
    ]
    custom_settings = {
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.ComicPipeline': 0},
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
1 change: 1 addition & 0 deletions src/favorites_crawler/spiders/nhentai.py
@@ -24,6 +24,7 @@ class NHentaiSpider(BaseSpider):
    )
    custom_settings = {
        'CONCURRENT_REQUESTS': 5,
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.ComicPipeline': 0},
    }

    def __init__(self, *args, **kwargs):
1 change: 1 addition & 0 deletions src/favorites_crawler/spiders/pixiv.py
@@ -20,6 +20,7 @@ class PixivSpider(BaseSpider):
        'DEFAULT_REQUEST_HEADERS': PIXIV_REQUEST_HEADERS,
        # Add PixivAuthorizationMiddleware after DefaultHeadersMiddleware
        'DOWNLOADER_MIDDLEWARES': {'favorites_crawler.middlewares.PixivAuthorizationMiddleware': 450},
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.PicturePipeline': 0},
    }

    def start_requests(self):
23 changes: 15 additions & 8 deletions src/favorites_crawler/spiders/yandere.py
@@ -8,7 +8,7 @@
from favorites_crawler.constants.domains import YANDERE_DOMAIN
from favorites_crawler.itemloaders import YanderePostItemLoader
from favorites_crawler.constants.endpoints import YANDERE_LIST_POST_URL, YANDERE_SHOW_POST_URL
from favorites_crawler.utils.files import list_yandere_id
from favorites_crawler.utils.files import list_yandere_post


class YandereSpider(BaseSpider):
@@ -17,18 +17,17 @@ class YandereSpider(BaseSpider):
    allowed_domains = (YANDERE_DOMAIN, )
    custom_settings = {
        'CONCURRENT_REQUESTS': 5,
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.PicturePipeline': 0},
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.limit = 100
        self.params = {'page': 1, 'limit': self.limit}
        self.posts = set()
        self.posts = {}

    def start_requests(self):
        self.posts = set(list_yandere_id(pathlib.Path(self.settings.get('FILES_STORE')), include_subdir=True))
        self.logger.debug(f'{len(self.posts)} posts will skip download.')

        self.posts = list_yandere_post(pathlib.Path(self.settings.get('FILES_STORE')), include_subdir=True)
        username = self.custom_settings.get('USERNAME')
        if not username:
            raise CloseSpider('Did you run "favors login yandere"?')
@@ -45,12 +44,20 @@ def parse_start_url(self, response, **kwargs):
            yield Request(f'{YANDERE_LIST_POST_URL}?{urlencode(self.params)}', callback=self.parse_start_url)

        for post in posts:
            if str(post['id']) in self.posts:
                continue
            post_id = str(post['id'])
            if post_id in self.posts:
                path = self.posts[post_id]  # type: pathlib.Path
                if (path.name ==
                        YanderePostItemLoader.default_item_class().get_filename(post['file_url'], self)):
                    continue
                path.unlink(missing_ok=True)

            loader = YanderePostItemLoader()
            loader.add_value('file_urls', post['file_url'])
            loader.add_value('tags', post['tags'])

            if self.settings.getbool('ENABLE_ORGANIZE_BY_ARTIST'):
                yield Request(YANDERE_SHOW_POST_URL.format(id=post['id']),
                yield Request(YANDERE_SHOW_POST_URL.format(id=post_id),
                              callback=self.parse, cb_kwargs={'loader': loader})
            else:
                yield loader.load_item()
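The reworked skip logic above no longer skips on post id alone: when the stored filename differs from the one the loader would now derive from `file_url`, the stale copy is deleted and the post is fetched again. A rough sketch of that check, with `posts` and `spider` standing in for `self.posts` and the spider instance:
```python
# Sketch of the stale-file check in parse_start_url (names are stand-ins).
stored = posts[post_id]  # pathlib.Path recorded by list_yandere_post
expected = YanderePostItemLoader.default_item_class().get_filename(post['file_url'], spider)
if stored.name == expected:
    pass  # same filename: already downloaded, skip this post
else:
    stored.unlink(missing_ok=True)  # filename changed: drop stale copy, re-download
```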
10 changes: 4 additions & 6 deletions src/favorites_crawler/spiders/yandere_vote.py
@@ -5,7 +5,7 @@
from favorites_crawler.spiders import BaseSpider
from favorites_crawler.constants.domains import YANDERE_DOMAIN
from favorites_crawler.constants.endpoints import YANDERE_VOTE_POST_URL
from favorites_crawler.utils.files import list_yandere_id
from favorites_crawler.utils.files import list_yandere_post


class YandereVoteSpider(BaseSpider):
@@ -27,12 +27,10 @@ def __init__(self, csrf_token, cookie, score, path, *args, **kwargs):
        self.path = Path(path)

    def start_requests(self):
        yandere_id_list = list_yandere_id(self.path)
        self.crawler.stats.set_value('file_count', len(yandere_id_list))
        yandere_id_set = set(yandere_id_list)
        self.crawler.stats.set_value('voted/expected', len(yandere_id_set))
        yandere_id_list = list(list_yandere_post(self.path).keys())
        self.crawler.stats.set_value('voted/expected', len(yandere_id_list))

        for i in yandere_id_set:
        for i in yandere_id_list:
            yield FormRequest(YANDERE_VOTE_POST_URL,
                              formdata={'id': str(i), 'score': str(self.score)},
                              cookies=self.cookies, headers=self.headers,
8 changes: 4 additions & 4 deletions src/favorites_crawler/utils/files.py
@@ -19,13 +19,13 @@ def create_comic_archive(path: Path, comment=b''):
    return archive_name


def list_yandere_id(path=Path('.'), include_subdir=False, result=None):
    result = [] if result is None else result
def list_yandere_post(path=Path('.'), include_subdir=False, result=None):
    result = {} if result is None else result
    for file_or_dir in path.iterdir():
        if file_or_dir.is_file():
            id_ = get_yandere_post_id(file_or_dir.name)
            if id_:
                result.append(id_)
                result[id_] = file_or_dir
        elif include_subdir:
            list_yandere_id(file_or_dir, include_subdir, result)
            list_yandere_post(file_or_dir, include_subdir, result)
    return result
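`list_yandere_post` replaces `list_yandere_id` and returns a mapping from post id to the stored file's path instead of a bare id list, which is what lets the yandere spider compare filenames above. A small usage sketch (directory and filename are assumed):
```python
from pathlib import Path

# Assuming downloads live under 'favorites/yandere':
posts = list_yandere_post(Path('favorites/yandere'), include_subdir=True)
# e.g. {'123456': Path('favorites/yandere/yande.re 123456 landscape.jpg'), ...}
for post_id, path in posts.items():
    print(post_id, path.name)
```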
2 changes: 1 addition & 1 deletion tests/test_init.py
@@ -19,7 +19,7 @@ def test_spider_closed_should_call_warn(mock_logger):
@pytest.mark.parametrize('item_scrapped_count,item_dropped_count', ((1, 0), (0, 1), (1, 1)))
def test_spider_closed_should_not_call_warn(mock_logger, item_scrapped_count, item_dropped_count):
    mock_spider = MagicMock()
    stats = {'item_scrapped_count': item_scrapped_count, 'item_dropped_count': item_dropped_count}
    stats = {'item_scraped_count': item_scrapped_count, 'item_dropped_count': item_dropped_count}
    mock_spider.crawler.stats.get_stats.return_value = {k: v for k, v in stats.items() if v}

    spider_closed(mock_spider)