forked from purcellconsult/Cracking-Python-Bootcamp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
08_regex_recap_in_python.py
191 lines (141 loc) · 7.01 KB
/
08_regex_recap_in_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
####################################
# Regular expressions patterns
# ----------------------------
#
# Self documenting examples
# about regular expressions along
# with resources on how to use them
# for better coding.
#
# Python Docs: https://docs.python.org/3/library/re.html
#
# By Doug Purcell
# http://www.purcellconsult.com
#
#####################################
import re
# Simple demo of regexes
print('Simple Demo of Regexes')
print('-----------------------')
phrase = 'Learn how regular expressions work in python.'
match = re.search('python', phrase)
if match:
print('A match is found starting at {}'
' and ending at {}'.format(match.start(), match.end()))
else:
print('No match found!')
start = match.start() # finds where key starts at in string
end = match.end() # finds where key ends in string
print(phrase[start:end]) # python
print()
# Regex functions
# ---------------
# search(pattern, string, flags=0) -> scans the ENTIRE string for a match
# match(pattern, string, flags=0) -> returns a match if 0 or more characters at the BEGINNING of string matches the regex
# fullmatch(pattern, string, flags=0) -> returns a match if the whole string matches the regex
# split(pattern, string, maxsplit=0, flags=0) -> split string by the pattern
# findall(pattern, string, flags=) -> returns non overlapping matches of pattern in string
print('Regex functions')
print('---------------')
print()
the_eagle_poem = """
He clasps the crag with crooked hands;
Close to the sun in lonely lands,
Ring'd with the azure world, he stands.
The wrinkled sea beneath him crawls;
He watches from his mountain walls,
And like a thunderbolt he falls.
"""
find_word = re.search('azure', the_eagle_poem) # <_sre.SRE_Match object; span=(92, 97), match='azure'>
print(find_word)
print(re.match('walls', the_eagle_poem)) # None
print(re.match('', the_eagle_poem)) # <_sre.SRE_Match object; span=(0, 0), match=''>
print(re.fullmatch(the_eagle_poem, the_eagle_poem)) # <_sre.SRE_Match object; span=(0, 227) ...>
print(re.split(';', the_eagle_poem)) # ['\nHe clasps the crag with crooked hands', "\n\nClose to the sun in lonely lands...]
print(re.findall('the', the_eagle_poem))
# Matching single characters
# --------------------------
print()
phrase = 'abcdefghijklmnopqrstuvwxyz'
capital_letters = 'ABCDEFGHABCDEFGHABCDEFGH'
print('Matching Characters')
print('-------------------')
print(re.search('h', phrase)) # <_sre.SRE_Match object; span=(7, 8), match='h'>
print(re.search('b', capital_letters)) # None
print(re.search('b', capital_letters, re.IGNORECASE)) # <_sre.SRE_Match object; span=(1, 2), match='B'>
print(re.findall('C', capital_letters)) # ['C', 'C', 'C']
alpha_pattern = '0123456789abcdefgh'
print(re.search('5', alpha_pattern)) # <_sre.SRE_Match object; span=(5, 6), match='5'>
print(re.search('a', alpha_pattern)) # <_sre.SRE_Match object; span=(10, 11), match='a'>
alpha_pattern_1 = '[0-9][a-z]' # any digit or lowercase letter
print(re.match(alpha_pattern_1, '9')) # None
print(re.search('[a-z]', '9920202022j22929')) # <_sre.SRE_Match object; span=(10, 11), match='j'>
print(re.search('[A-Z]', 'abcdefghijklmn83nOjsksZ')) # <_sre.SRE_Match object; span=(17, 18), match='O'>
print(re.search('[!@#$%^&*()-+{}[]|\;"<>?', 'zvgsggs272292hkOwuyeg%ss')) # <_sre.SRE_Match object; span=(21, 22), match='%'>
print(re.search('[a-zA-Z]', '22838828289020932;/asksk')) # <_sre.SRE_Match object; span=(19, 20), match='a'>
# verifies a sequence of characters in a certain order
print(re.search('[a-z]{11}', '2517abcd17171179Abs20abracadabra2298')) # <_sre.SRE_Match object; span=(21, 32), match='abracadabra'>
print(re.search('[a-z]{1,5}', '2517abcde17171179')) # <_sre.SRE_Match object; span=(4, 9), match='abcde'>
print(re.search('[a-z]{1,5}[0-9]{3}', 'c999')) # <_sre.SRE_Match object; span=(0, 4), match='c999'>
print(re.search('[a-z]{1,5}[0-9]{3}', 'd17')) # None. Why?
print()
# Common Regex Character Classes
# ------------------------------
# \d -> any digit
# \D -> any non digit
# \w -> any word
# \W -> any non alphanumeric character
print('Common Regex Character Classes and Repetitions ')
print('----------------------------------------------')
print(re.search('\d', 'jsjjsk273829BHAja')) # <_sre.SRE_Match object; span=(1, 2), match='2'>
print(re.search('\d\d', '2j8k2c8c34m3ma1')) # <_sre.SRE_Match object; span=(8, 10), match='34'>
print(re.search('\d{3}-\d{3}-\d{4}', '230-392-9327')) # <_sre.SRE_Match object; span=(0, 12), match='230-392-9327'>
# matches any alphanumeric character and the underscore
print(re.search('\w', ';\';,.,.,.283jsns9')) # <_sre.SRE_Match object; span=(9, 10), match='2'>
print(re.search('\w\w\w', '9d1')) # <_sre.SRE_Match object; span=(0, 3), match='9d1'>
# Combining the \d and \w character classes:
print(re.search('\w\d\w\d', '_0_0')) # <_sre.SRE_Match object; span=(0, 4), match='_0_0'>
print()
# Common Regex Symbols
# ------------------------------
# * -> 0 or more repetitions
# + -> 1 or more repetitions
# ? -> 0 or 1 repetitions of the proceeding
# . -> matches any character except for a newline
# $ -> matches the end of the string, or just before the newline
# ^ -> matches the start of the string
print('Common Regex Symbols ')
print('----------------------')
# the * operator matches 0 or more repetitions
print(re.search('\d*', '28392202')) # <_sre.SRE_Match object; span=(0, 8), match='28392202'>
# the + operator matches 1 or more repetitions
print(re.search('aba+', 'abaaaaaa')) # <_sre.SRE_Match object; span=(0, 8), match='abaaaaaa'>
# the ? causes the regex to match 0 or 1 repetitions of the preceding
print(re.search('abc?', 'ab')) # ab validates
print(re.search('abc?', 'abc')) # abc validates. what about ac or bc?
# the . matches any character
print(re.search('.{3}', '/,@')) # <_sre.SRE_Match object; span=(0, 3), match='/,@'>
# $ matches just before the end of newline
print(re.search('^\d\d\d$', '278')) # <_sre.SRE_Match object; span=(0, 3), match='278'>
# $ Matches just before the end of the string.
# If the search key appears before the end of the string, then None returns.
# It makes way more sense with a little code.
print(re.search('zo', 'zoey')) # <_sre.SRE_Match object; span=(0, 2), match='zo'>
print(re.search('zo$', 'zoey')) # None
print(re.search('zo$', 'hey zo')) # <_sre.SRE_Match object; span=(4, 6), match='zo'>
print(re.search('zo$', 'Hey zo! What\'s cracking?')) # None
print()
# ^ matches the start of the string
print(re.search('^Hi', 'Hello, Hi')) # None
print(re.search('^Hi', 'Hi Hello')) # <_sre.SRE_Match object; span=(0, 2), match='Hi'>
print()
# GROUPING
# --------
# [] -> a set of characters
# | -> matches A or B
# () -> matches the regex inside the parentheses
print('GROUPING')
print('--------')
print(re.search('[a-z][A-Z][0-9][\w]', 'aB6H')) # <_sre.SRE_Match object; span=(0, 4), match='aB6H'>
print(re.search('[a-z]|[0-9]|[>]', 'HAKSKS6,./?')) # <_sre.SRE_Match object; span=(6, 7), match='6'>
print(re.search('(aab)(bbc)', 'aabbbc')) # None