-
-
Notifications
You must be signed in to change notification settings - Fork 20
/
metadata.py
246 lines (204 loc) · 8.62 KB
/
metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import json
import random
import re
import string
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any, BinaryIO, TypedDict
if TYPE_CHECKING:
from .parse_job import ParseJobData
@dataclass
class MetaDataResult:
    """Outcome of a metadata check for a single book.

    Built by ``check_metadata`` (calibre GUI) and ``cli_check_metadata``
    (command line). Every field has a default, so callers may fill in only
    what they know.
    """

    # calibre library id of the book; stays 0 when not supplied (CLI path).
    book_id: int = 0
    # Language code of the book as returned by ``lang_as_iso639_1``.
    book_lang: str = ""
    # Upper-case format names selected for processing, e.g. "EPUB".
    book_fmts: list[str] = field(default_factory=list)
    # File paths matching ``book_fmts`` (left empty by the CLI path).
    book_paths: list[str] = field(default_factory=list)
    # Metadata object of the book; its concrete type comes from calibre.
    mi: Any = None
    # One ``is_ww_supported`` result per entry in ``book_fmts``.
    support_ww_list: list[bool] = field(default_factory=list)
    # True when the language data has a non-empty "spacy" entry for the book
    # language.
    support_x_ray: bool = False
def is_ww_supported(book_lang: str, gloss_lang: str) -> bool:
    """Return whether ``book_lang`` is supported for the given gloss language.

    The language data loaded through ``load_languages_data`` is looked up by
    ``gloss_lang``. Its "lemma_languages" entry lists the supported book
    language codes. When that list is empty and the entry's "gloss_source"
    is "kaikki", every language present in the language data counts as
    supported.

    Args:
        book_lang: Language code of the book.
        gloss_lang: Language code of the glosses (the ``gloss_lang``
            preference at the call sites in this file).

    Returns:
        True when ``book_lang`` is supported; False otherwise, including
        when ``gloss_lang`` has no entry in the language data.
    """
    from .utils import get_plugin_path, load_languages_data

    lang_dict = load_languages_data(get_plugin_path())
    lang_data = lang_dict.get(gloss_lang, {})
    supported_codes = lang_data.get("lemma_languages", [])
    # ``.get`` rather than indexing: ``lang_data`` is ``{}`` for an unknown
    # gloss language and must yield False instead of raising KeyError.
    if not supported_codes and lang_data.get("gloss_source") == "kaikki":
        return book_lang in lang_dict
    return book_lang in supported_codes
def check_metadata(gui: Any, book_id: int, custom_x_ray: bool) -> MetaDataResult | None:
    """Validate a library book and collect what is needed to process it.

    Args:
        gui: calibre GUI object; only ``gui.current_db.new_api`` is used.
        book_id: Id of the book in the current calibre library.
        custom_x_ray: When true, the manual format chooser dialog is skipped
            even if the ``choose_format_manually`` preference is enabled.

    Returns:
        A ``MetaDataResult`` describing the selected formats, or ``None``
        when the book language is not in the language data, none of the
        preferred formats is available, or the user dismisses the format
        chooser dialog.
    """
    from calibre.utils.localization import lang_as_iso639_1

    from .config import prefs
    from .error_dialogs import unsupported_format_dialog, unsupported_language_dialog
    from .utils import get_plugin_path, load_languages_data

    db = gui.current_db.new_api
    lang_dict = load_languages_data(get_plugin_path(), False)
    mi = db.get_metadata(book_id, get_cover=True)
    # https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    book_lang = lang_as_iso639_1(mi.get("language"))
    if book_lang not in lang_dict:
        unsupported_language_dialog(mi.get("title"))
        return None

    # Keep the order of the "preferred_formats" preference, not the order
    # in which the library reports the formats.
    book_fmts = db.formats(book_id)
    supported_fmts = [f for f in prefs["preferred_formats"] if f in book_fmts]
    if not supported_fmts:
        unsupported_format_dialog()
        return None

    if len(supported_fmts) > 1 and prefs["choose_format_manually"] and not custom_x_ray:
        from .config import ChooseFormatDialog

        choose_format_dlg = ChooseFormatDialog(supported_fmts)
        if not choose_format_dlg.exec():
            return None
        supported_fmts = [choose_format_dlg.chosen_format]
    if not prefs["use_all_formats"]:
        supported_fmts = supported_fmts[:1]

    # The answer depends only on the book and gloss languages, never on the
    # format, so compute it once instead of once per format.
    ww_supported = is_ww_supported(book_lang, prefs["gloss_lang"])
    return MetaDataResult(
        book_id=book_id,
        book_lang=book_lang,
        book_fmts=supported_fmts,
        book_paths=[db.format_abspath(book_id, fmt) for fmt in supported_fmts],
        mi=mi,
        support_ww_list=[ww_supported] * len(supported_fmts),
        support_x_ray=lang_dict[book_lang]["spacy"] != "",
    )
def cli_check_metadata(book_path_str: str, log: Any) -> MetaDataResult | None:
    """Read metadata straight from a book file and check that it is usable.

    The format is taken from the upper-cased file suffix. KFX, EPUB, AZW3,
    AZW and MOBI files are read; for anything else a warning is logged.

    Args:
        book_path_str: Path of the book file.
        log: Logger providing ``prints`` and a ``WARN`` level.

    Returns:
        A ``MetaDataResult`` for the single format, or ``None`` when the
        format is not handled or the book language is not in the language
        data.
    """
    from calibre.utils.localization import lang_as_iso639_1

    from .config import prefs
    from .utils import get_plugin_path, load_languages_data

    source = Path(book_path_str)
    book_fmt = source.suffix.upper()[1:]  # ".epub" -> "EPUB"

    mi = None
    if book_fmt == "KFX":
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.localization import canonicalize_lang
        from calibre_plugins.kfx_input.kfxlib import YJ_Book

        # KFX metadata comes from the KFX Input plugin and is copied into a
        # calibre Metadata object so the rest of the function is uniform.
        kfx_md = YJ_Book(str(source)).get_metadata()
        title = getattr(kfx_md, "title", None)
        language = getattr(kfx_md, "language", None)
        mi = Metadata(title)
        mi.language = canonicalize_lang(language)
    elif book_fmt == "EPUB":
        from calibre.ebooks.metadata.epub import get_metadata

        with source.open("rb") as stream:
            mi = get_metadata(stream, False)
    elif book_fmt in ("AZW3", "AZW", "MOBI"):
        from calibre.ebooks.metadata.mobi import get_metadata

        with source.open("rb") as stream:
            mi = get_metadata(stream)

    if mi is None:
        log.prints(log.WARN, "The book format is not supported.")
        return None

    lang_dict = load_languages_data(get_plugin_path(), False)
    book_lang = lang_as_iso639_1(mi.get("language"))
    if book_lang not in lang_dict:
        log.prints(
            log.WARN,
            f"The language of the book {mi.get('title')} is not supported.",
        )
        return None

    gloss_lang = prefs["gloss_lang"]
    ww_supported = is_ww_supported(book_lang, gloss_lang)
    x_ray_supported = lang_dict[book_lang]["spacy"] != ""
    return MetaDataResult(
        book_fmts=[book_fmt],
        mi=mi,
        book_lang=book_lang,
        support_ww_list=[ww_supported],
        support_x_ray=x_ray_supported,
    )
def random_asin() -> str:
    """Return a random placeholder ASIN: "BB" plus 8 random characters.

    The random characters are upper-case ASCII letters and digits. The
    original note calls the result an "invalid ASIN" -- presumably meaning
    it does not identify a real product.
    """
    alphabet = string.ascii_uppercase + string.digits
    suffix = "".join(random.choices(alphabet, k=8))
    return "BB" + suffix
def validate_asin(asin: str | None, mi: Any) -> str:
    """Return ``asin`` if it is well-formed, otherwise a newly generated one.

    A well-formed ASIN is "B" followed by exactly nine digits or upper-case
    letters. When ``asin`` is ``None`` or malformed, a random ASIN is
    created and stored on ``mi`` under the "mobi-asin" identifier.

    Args:
        asin: ASIN read from the book, or ``None`` if the book has none.
        mi: Metadata object; only touched when a new ASIN is generated.

    Returns:
        The ASIN to use for the book.
    """
    if asin is not None and re.fullmatch(r"B[0-9A-Z]{9}", asin) is not None:
        return asin

    # Missing or malformed: fall back to a random ASIN and record it.
    replacement = random_asin()
    mi.set_identifier("mobi-asin", replacement)
    return replacement
class KFXJson(TypedDict):
    """Shape of one item with "position", "content" and "type" keys.

    NOTE(review): presumably one element of the list stored in
    ``data.kfx_json`` by ``get_asin_etc`` -- confirm against the KFX Input
    plugin's JSON output.
    """

    position: int
    content: str
    type: int
def get_asin_etc(data: "ParseJobData", set_en_lang: bool = False) -> None:
    """Read the ASIN and related identifiers from the book file into ``data``.

    For KFX books this sets ``data.acr``, ``data.asin`` and
    ``data.kfx_json``. For every other format except EPUB it sets
    ``data.acr``, ``data.revision``, ``data.mobi_codec``, ``data.asin`` and
    ``data.mobi_html``. EPUB books are left untouched.

    The book file is rewritten when the stored ASIN fails ``validate_asin``
    (a new one is generated) or when ``set_en_lang`` is true and the book
    is not already marked as English.

    Args:
        data: Job data; ``book_fmt``, ``book_path`` and ``mi`` are read.
        set_en_lang: Force the language stored in the book file to English.
    """
    if data.book_fmt == "KFX":
        from calibre_plugins.kfx_input.kfxlib import YJ_Book
        yj_book = YJ_Book(data.book_path)
        yj_md = yj_book.get_metadata()
        book_asin = getattr(yj_md, "asin", "")
        data.acr = getattr(yj_md, "asset_id", "")
        book_lang = getattr(yj_md, "language", "en")
        data.asin = validate_asin(book_asin, data.mi)
        # validate_asin returns a different value only when it had to
        # generate a replacement ASIN.
        update_asin = data.asin != book_asin
        update_lang = False
        if set_en_lang and book_lang != "en":
            update_lang = True
            book_lang = "en"
        if update_asin or update_lang:
            # Rewrites the file and returns the updated book object, which
            # replaces the one opened above for the JSON export below.
            yj_book = update_kfx_metadata(data.book_path, data.asin, book_lang)
        data.kfx_json = json.loads(yj_book.convert_to_json_content())["data"]
    elif data.book_fmt != "EPUB":
        from calibre.ebooks.metadata.mobi import MetadataUpdater
        # "r+b": the same handle is read here and may be written through
        # MetadataUpdater below.
        with open(data.book_path, "r+b") as f:
            data.acr = f.read(32).rstrip(b"\x00").decode("utf-8") # Palm db name
            data.revision = get_mobi_revision(f)
            # get_mobi_revision moved the file position; rewind before
            # handing the stream to MetadataUpdater.
            f.seek(0)
            mu = MetadataUpdater(f)
            data.mobi_codec = mu.codec
            # NOTE(review): EXTH records 113 and 504 are presumably the two
            # places an ASIN can be stored -- confirm against the MOBI spec.
            asin_bytes = mu.original_exth_records.get(
                113
            ) or mu.original_exth_records.get(504)
            book_asin = asin_bytes.decode(mu.codec) if asin_bytes is not None else None
            data.asin = validate_asin(book_asin, data.mi)
            locale = mu.record0[0x5C:0x60] # MOBI header locale
            mi_lang = data.mi.language
            update_asin = data.asin != book_asin
            update_lang = False
            # Only the last two bytes of the 4-byte locale are compared;
            # value 9 is paired with "eng" below, so it presumably denotes
            # English.
            if set_en_lang and locale[2:] != (9).to_bytes(2, "big"):
                update_lang = True
                locale = (9).to_bytes(4, "big")
                mi_lang = "eng"
            if update_asin or update_lang:
                data.mi.language = mi_lang
                mu.record0[0x5C:0x60] = locale
                mu.update(data.mi, asin=data.asin)
        # NOTE(review): the source's indentation was lost; this call is
        # assumed to sit after the ``with`` block because extract_mobi
        # opens the path itself.
        data.mobi_html = extract_mobi(data.book_path)
def get_mobi_revision(f: BinaryIO) -> str:
    """Return the 4-byte Unique-ID of the MOBI header as a hex string.

    Modified from calibre.ebooks.mobi.reader.headers:MetadataHeader.header.
    The stream position is left just past the bytes that were read.

    Args:
        f: Seekable binary stream of a MOBI/AZW file.

    Returns:
        Eight lower-case hexadecimal characters.
    """
    # A big-endian 4-byte integer at file offset 78 gives the position the
    # header fields are measured from (presumably the first record).
    f.seek(78)
    record_offset = int.from_bytes(f.read(4), "big")
    # The Unique-ID sits 32 bytes after that position.
    f.seek(record_offset + 32)
    unique_id = f.read(4)
    return unique_id.hex()
def extract_mobi(book_path: str) -> bytes:
    """Return the raw HTML of a MOBI/KF8 book.

    Uses code from calibre.ebooks.mobi.reader.mobi8:Mobi8Reader.__call__
    and calibre.ebook.conversion.plugins.mobi_input:MOBIInput.convert, see
    also
    https://github.com/kevinhendricks/KindleUnpack/blob/master/lib/mobi_k8proc.py#L216

    Args:
        book_path: Path of the book file.

    Returns:
        The MOBI HTML for non-KF8 books, or the concatenated KF8 parts for
        standalone KF8 books.

    Raises:
        Exception: With message "JointMOBI" when the file is a joint
            MOBI6/KF8 book.
    """
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader

    with open(book_path, "rb") as stream:
        reader = MobiReader(stream)
        if reader.kf8_type == "joint":
            raise Exception("JointMOBI")

        reader.check_for_drm()
        reader.extract_text()
        mobi_html = reader.mobi_html
        if reader.kf8_type != "standalone":
            return mobi_html

        # Standalone KF8: rebuild the text from the KF8 index parts, as
        # KindleUnpack does.
        kf8_reader = Mobi8Reader(reader, reader.log)
        kf8_reader.kf8_sections = reader.sections
        kf8_reader.read_indices()
        kf8_reader.build_parts()
        return b"".join(kf8_reader.parts)  # KindleUnpack
def update_kfx_metadata(book_path: str, asin: str, lang: str) -> Any:
    """Rewrite a KFX book in place with a new ASIN and language.

    The book is decoded with the new metadata applied (content type is set
    to "EBOK"), serialized to a single KFX file and written back to
    ``book_path``.

    Args:
        book_path: Path of the KFX file to update.
        asin: ASIN to store in the book.
        lang: Language code to store in the book.

    Returns:
        The ``YJ_Book`` object holding the updated book.
    """
    from calibre_plugins.kfx_input.kfxlib import YJ_Book, YJ_Metadata

    yj_book = YJ_Book(book_path)
    yj_md = YJ_Metadata()
    yj_md.asin = asin
    yj_md.language = lang
    yj_md.content_type = "EBOK"
    yj_book.decode_book(set_metadata=yj_md)

    # Serialize before opening the file: mode "wb" truncates on open, so a
    # failure during conversion would otherwise leave an empty book behind.
    kfx_bytes = yj_book.convert_to_single_kfx()
    with open(book_path, "wb") as f:
        f.write(kfx_bytes)
    return yj_book