forked from qalhata/Python-Scripts-Repo-on-Data-Science
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHTML_with_BeautifulSoup_GetTextData.py
56 lines (37 loc) · 1.3 KB
/
HTML_with_BeautifulSoup_GetTextData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 00:01:32 2017
@author: Shabaka
"""
# Import packages
import requests
from bs4 import BeautifulSoup
# Specify url: url
url = 'https://www.python.org/~guido/'

# Package the request, send the request and catch the response: r
# The timeout keeps the script from hanging indefinitely on an unresponsive
# server, and raise_for_status() prevents an HTTP error page from being
# parsed as if it were the requested document.
r = requests.get(url, timeout=10)
r.raise_for_status()

# Extract the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
# The parser is named explicitly so the resulting tree does not depend on
# which optional parsers happen to be installed (and bs4 does not warn).
soup = BeautifulSoup(html_doc, 'html.parser')

# Text content of the <body> element ('' when the page has no <body>)
body_text = soup.body.text if soup.body is not None else ''

# Find all bold text and return a list
bold = soup.find_all('b')
print(bold)

# Print the whole page in a readable, indented form
print(soup.prettify())

# Get the title of Guido's webpage: guido_title
guido_title = soup.title

# Print the title of Guido's webpage to the shell
print(guido_title)

# Get Guido's text: guido_text
guido_text = soup.get_text()

# Print Guido's text to the shell
print(guido_text)

# Look up a single element by its id attribute. find() returns None when
# nothing matches, so a missing element does not raise like indexing [0]
# into an empty result list would.
para2 = soup.find(id="para2")
para2_text = para2.text if para2 is not None else None

# Match several tag names at once by passing a list of names. (A dict of
# {name: True} is not among the filter types documented for bs4, so the
# list form is used.)
bold_and_paragraphs = soup.find_all(['b', 'p'])

# Find a link in the document
# find() returns only the first match - use find_all() to get every link.
# href=True restricts the match to anchors that actually carry an href, so
# the subscript below cannot fail with KeyError.
links = soup.find('a', href=True)
if links is not None:
    print(links['href'] + " is the url and " + links.text + " is the text")

# Other ways to search relative to a tag:
# find_parents, find_next_siblings, find_previous_siblings,
# find_next, find_previous, find_all_next and find_all_previous