-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path__init__.py
131 lines (99 loc) · 3.82 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from project1 import entity_finders
from typing import Tuple
import re
import en_core_web_md
from spacy.tokens import Doc, Span
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
import nltk
nltk.download("wordnet", quiet=True)
nltk.download("omw", quiet=True)
BLOCK_CHARACTER = '\u2588'
def redact_pipeline(
        unredacted_txt: str,
        redacts: dict[str, bool],
        concepts: list[str]) -> Tuple[str, str]:
    """Redacts the given text based on 'redacts' and 'concepts'

    Parameters
    ----------
    unredacted_txt : Content of the document to be redacted.
    redacts : Dict containing redactions to be made
    concepts : List of concepts used to redact similar sentences

    Returns
    -------
    redacted_txt,
    stats : Redacted text and statistics tuple are returned
    """
    # Collapse runs of spaces so character offsets in the redaction
    # spans line up with a normalized text.
    unredacted_txt = re.sub(r' +', ' ', unredacted_txt)
    nlp = en_core_web_md.load()
    doc = nlp(unredacted_txt)
    redactions: list[Span] = []
    # Map each redaction flag to its finder.  Insertion order matches
    # the original if-chain (address, names, genders, dates, phones),
    # and `.get(key) is True` preserves the original "key present and
    # exactly True" semantics.
    finders = {
        'address': entity_finders.address_finder,
        'names': entity_finders.names_finder,
        'genders': entity_finders.genders_finder,
        'dates': entity_finders.dates_finder,
        'phones': entity_finders.phones_finder,
    }
    for key, finder in finders.items():
        if redacts.get(key) is True:
            redactions += finder(doc, nlp=nlp)
    if concepts:
        # Adding Spacy and Wordnet custom Annotator.
        # Enables the use of token._.wordnet.synsets() and
        # token._.wordnet.lemmas().
        # FIX: reuse the already-loaded model instead of calling
        # en_core_web_md.load() a second time (expensive and redundant;
        # the text being processed is identical).
        nlp.add_pipe(
            "spacy_wordnet",
            after='tagger',
            config={
                'lang': nlp.lang})
        doc = nlp(unredacted_txt)
        # Merge each named entity into a single token so the concept
        # finder sees whole entities rather than entity fragments.
        with doc.retokenize() as retokenizer:
            for ent in doc.ents:
                retokenizer.merge(ent)
        redactions += entity_finders.concept_finder(
            doc, nlp=nlp, concepts=concepts)
    return redact(doc, redactions), get_stats(doc, redactions)
def redact(doc: Doc, redactions: list[Span]) -> str:
    """Redacts the list of Spans in the given document

    Parameters
    ----------
    doc : Raw unredacted document
    redactions : List of SpaCy Spans that needs to be redacted in doc

    Returns
    -------
    redacted_txt: Redacted text
    """
    text = doc.text
    # Process spans right-to-left so earlier character offsets remain
    # valid as the string is rebuilt.  The mask is the same length as
    # the original slice, so offsets would not shift anyway.
    for span in sorted(redactions, key=lambda s: s.start, reverse=True):
        mask = BLOCK_CHARACTER * len(text[span.start_char: span.end_char])
        text = text[:span.start_char] + mask + text[span.end_char:]
    return text
def get_stats(doc: Doc, redactions: list[Span]) -> str:
    """Generates the statistics of the given document and redacted Spans

    Parameters
    ----------
    doc : Raw unredacted document
    redactions : List of SpaCy Spans that needs to be redacted in doc

    Returns
    -------
    statistics : Formatted Statistics
    """
    lines: list[str] = []
    # One section per redaction label; labels carry an 'm' prefix that
    # is stripped for display.
    for label in ("mNAME", "mGENDER", "mDATE", "mPHONENUM", "mADDRESS",
                  "mCONCEPT"):
        offsets = sorted((r.start_char, r.end_char)
                         for r in redactions if r.label_ == label)
        lines.append(f"{label[1:]}: {len(offsets)} redactions")
        lines.extend(
            f"\tStart: {start}, End: {end} => {doc.text[start: end]}"
            for start, end in offsets)
    return "\n".join(lines)