-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_spacy.py
54 lines (43 loc) · 1.66 KB
/
example_spacy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import requests
from bs4 import BeautifulSoup
# Load spaCy's German model
import spacy
# NOTE(review): `nlp` is not referenced by any later line visible in this
# script (the scraping and SHACL export below never call it) -- confirm the
# model load is still needed before keeping it.
nlp = spacy.load('de_core_news_sm')
# Define the URL of the website you want to scrape
url = 'https://www.arbeitsagentur.de/familie-und-kinder/kinderzuschlag-verstehen/kinderzuschlag-anspruch-hoehe-dauer'

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests.get() may block indefinitely on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30

# Turtle snippet emitted once per scraped condition; "{description}" is
# substituted with the (escaped) paragraph text.
# NOTE: every snippet repeats the @prefix lines and re-uses the same
# ex:EligibilityShape subject, so all conditions end up as property
# constraints on that one shape.
shapes_template = """
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix ex: <http://example.org/> .
ex:EligibilityShape a sh:NodeShape ;
sh:targetClass ex:Person ;
sh:property [
sh:path ex:condition ;
sh:description "{description}" ;
] .
"""

# Characters that must not appear raw inside a double-quoted Turtle string
# literal, mapped to their escape sequences. str.translate applies the map
# in a single pass, so backslashes introduced by one escape are never
# escaped a second time.
_TURTLE_ESCAPES = str.maketrans({
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
    '\r': '\\r',
})


def _escape_turtle_literal(text: str) -> str:
    """Return *text* escaped for use inside a double-quoted Turtle literal."""
    return text.translate(_TURTLE_ESCAPES)


def _build_shacl_shapes(conditions) -> str:
    """Render one ``shapes_template`` snippet per condition and concatenate them.

    Args:
        conditions: Iterable of plain-text condition strings.

    Returns:
        The concatenated Turtle text; an empty string when *conditions*
        is empty.
    """
    return "".join(
        shapes_template.replace("{description}", _escape_turtle_literal(condition))
        for condition in conditions
    )


# Send a GET request to the website
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract content from specific HTML elements (modify selectors as necessary).
    # Paragraphs whose stripped text is empty are skipped so they do not
    # produce shapes with an empty sh:description.
    paragraph_texts = (p.get_text(strip=True) for p in soup.select('div.edi-text p'))
    eligibility_conditions = [text for text in paragraph_texts if text]

    # Print extracted conditions
    for condition in eligibility_conditions:
        print(condition)

    # Convert conditions to SHACL format
    shacl_shapes = _build_shacl_shapes(eligibility_conditions)

    # Save to a file
    with open('eligibility_conditions_shacl.ttl', 'w', encoding='utf-8') as file:
        file.write(shacl_shapes)
    print("SHACL shapes created successfully!")
else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')