forked from emt-project/emt-static
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_report.py
77 lines (70 loc) · 2.04 KB
/
create_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import glob
import os
import pandas as pd
from acdh_cidoc_pyutils import extract_begin_end
from acdh_tei_pyutils.tei import TeiReader
from acdh_tei_pyutils.utils import extract_fulltext
from tqdm import tqdm
# Build a per-document status report for the TEI editions and write it to
# html/report.csv.  Each row records whether the document has body text,
# tagged entities (tei:rs) and a regest, plus the sender/receiver metadata
# found in its tei:correspAction elements.

# Output column -> XPath for the correspondence metadata.  Insertion order
# is the column order in the CSV.
_CORRESP_XPATHS = {
    "sender_name": ".//tei:correspAction[@type='sent']/tei:persName/text()",
    "sender_id": ".//tei:correspAction[@type='sent']/tei:persName/@ref",
    "receiver_name": ".//tei:correspAction[@type='received']/tei:persName/text()",
    "receiver_id": ".//tei:correspAction[@type='received']/tei:persName/@ref",
}


def _first_match(doc, xpath, default=""):
    """Return the first result of *xpath* evaluated on *doc*.

    *default* is returned when the XPath yields no result.
    """
    try:
        return doc.any_xpath(xpath)[0]
    except IndexError:
        return default


def _ja_nein(value):
    """Map a truthy/falsy value to the report's "ja"/"nein" flag."""
    return "ja" if value else "nein"


files = sorted(glob.glob("./data/editions/*.xml"))
transcribed = []
rs_tagged = []
abstract = []
items = []
for x in tqdm(files):
    item = {"id": os.path.basename(x)}
    doc = TeiReader(x)

    # A document without any tei:body still raises IndexError here, exactly
    # as before.
    fulltext = extract_fulltext(doc.any_xpath(".//tei:body")[0])
    if fulltext:
        transcribed.append(x)
    item["full_text"] = _ja_nein(fulltext)

    rs_tags = doc.any_xpath(".//tei:body//tei:rs")
    if rs_tags:
        rs_tagged.append(x)
    item["entities"] = _ja_nein(rs_tags)

    # Only a missing abstract is treated as "no regest"; any other error is
    # allowed to surface instead of being silently swallowed.
    # Compare against None explicitly: the truth value of an lxml element
    # depends on whether it has children, not on whether it exists.
    regest_node = _first_match(doc, ".//tei:abstract[@n='regest']", default=None)
    regest = "" if regest_node is None else extract_fulltext(regest_node)
    if regest:
        abstract.append(x)
    item["regest"] = _ja_nein(regest)

    for column, xpath in _CORRESP_XPATHS.items():
        item[column] = _first_match(doc, xpath)

    try:
        date_node = doc.any_xpath(".//tei:correspAction[@type='sent']/tei:date")[0]
        # First element of the value returned by extract_begin_end.
        item["sender_date"] = extract_begin_end(date_node)[0]
    except IndexError:
        item["sender_date"] = ""

    items.append(item)

df = pd.DataFrame(items)
df.to_csv("html/report.csv", index=False)