正则表达式
re模块
import re
content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
# Hello开头 \s 所有空字符 \d 数字 \s \d{4} 四个数字 \w 字母数字及下划线 .代替任意字符 * 一个或者多个 Demo结尾
result = re.match("^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$",content)
print(result)
print(result.group())
print(result.span())
41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)
泛匹配
import re
content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
# Hello开头 \s 所有空字符 \d 数字 \s \d{4} 四个数字 \w 字母数字及下划线 .代替任意字符 * 一个或者多个 Demo结尾
result = re.match("^Hello.*Demo$",content)
print(result)
print(result.group())
print(result.span())
41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)
匹配目标
import re
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.match('^Hello\s(\d+\s\d+)\sWorld.*Demo$',content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
123 4567
(0, 41)
贪婪匹配
import re
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.match('^He.*(\d+\s\d+).*Demo$',content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
3 4567
(0, 41)
非贪婪匹配
匹配尽可能少的字符 获取的结果尽可能多
import re
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.match('^He.*?(\d+\s\d+).*Demo$',content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
123 4567
(0, 41)
匹配模式
import re
content = '''Hello 123 4567
World_This is a Regex Demo'''
result = re.match('^He.*?(\d+\s\d+).*?Demo$',content,re.S)
# . 不能匹配换行符 添加 re.S
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())
<re.Match object; span=(0, 42), match='Hello 123 4567 \nWorld_This is a Regex Demo'>
Hello 123 4567
World_This is a Regex Demo
123 4567
(0, 42)
转义
import re
content = 'price is $5.00'
r = re.match('price is \$5\.00',content)
print(r)
<re.Match object; span=(0, 14), match='price is $5.00'>
$\color{red}{尽量使用泛匹配,使用货号得到匹配目标,尽量使用非贪婪模式,有换行就用 re.S}$
re.Search
扫描匹配
import re
content = 'XHello 123 4567 World_This is a Regex Demo'
print(len(content))
result = re.match("Hello.*Demo$",content)
print(result)
42
None
content = 'XHello 123 4567 World_This is a Regex Demo'
print(len(content))
# Hello开头 \s 所有空字符 \d 数字 \s \d{4} 四个数字 \w 字母数字及下划线 .代替任意字符 * 一个或者多个 Demo结尾
result = re.search("Hello.*Demo$",content)
print(result)
print(result.group())
print(result.span())
42
<re.Match object; span=(1, 42), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(1, 42)
re.findall
re.compile
匹配联系练习
import re
html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
<a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>"""
results = re.search('<li.*active.*?singer="(.*?)">(.*?)</a>',html,re.S)
if results:
print(results.group(0)+"\n------")
print(results.group(1),results.group(2))
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
------
齐秦 往事随风
import re
html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
<a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>"""
results = re.search('<li.*?singer="(.*?)">(.*?)</a>',html,re.S)
if results:
print(results.group(0)+"\n------")
print(results.group(1),results.group(2))
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
------
任贤齐 沧海一声笑
import re
html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
<a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>"""
results = re.search('<li.*?singer="(.*?)">(.*?)</a>',html)
if results:
print(results.group(0)+"\n------")
print(results.group(1),results.group(2))
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a>
------
beyond 光辉岁月
import re
html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
<a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>"""
results = re.findall('<li.*?href="(.*?)"\ssinger="(.*?)">(.*?)</a>',html,re.S)
if results:
for result in results:
print(result[0],result[1],result[2])
/2.mp3 任贤齐 沧海一声笑
/3.mp3 齐秦 往事随风
/4.mp3 beyond 光辉岁月
/5.mp3 陈慧琳 记事本
import re
html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </li> <li data一view=*5">
<a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>"""
results = re.findall('<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>',html,re.S)
if results:
for result in results:
print(result)
('', '一路上有你', '')
('<a href="/2.mp3" singer="任贤齐">', '沧海一声笑', '</a>')
('<a href="/3.mp3" singer="齐秦">', '往事随风', '</a>')
('<a href="/4.mp3" singer="beyond">', '光辉岁月', '</a>')
('<a href="/5.mp3" singer="陈慧琳">', '记事本', '</a>')
('<a href=*/6.mp3" singer="邓丽君">', '但愿人长久', '</a>')
re.sub
替换字符串中每一个匹配的子串后返回替换后的字符串
import re
content = 'Hello World!'
content = re.sub ('o',"O",content)
print(content)
HellO WOrld!
import re
content = 'Hello World!'
content = re.sub ('(World!)',r'\1 你好!',content)
print(content)
Hello World! 你好!
import re
html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </li> <li data一view=*5">
<a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>"""
html = re.sub("<a .*?>|</a>",'',html)
print(html)
r = re.findall('<li.*?>(.*?)</li>',html,re.S)
for i in r:
print(i.strip())
<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
沧海一声笑
</li>
<li data-view="4" class="active">
往事随风
</li>
<li data-view="6">光辉岁月</li><li data一view="5">记事本 </li> <li data一view=*5">
但愿人长久
</li>
</ul>
</div>
一路上有你
沧海一声笑
往事随风
光辉岁月
记事本
但愿人长久
re.compile
将正则表达式串编译成正则表达对象,方便复用
import re
content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
pattern = re.compile("^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$",re.S)
result = re.match(pattern,content)
print(result)
print(result.group())
print(result.span())
41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)
实战练习
import re
import requests
headers = {
'User-Agent':'Mozilla/5.0(Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}
html = requests.get('https://book.douban.com/', headers=headers)
print(html.status_code)
html = html.text
pattern = re.compile('<li.*?cover.*?href="(.*?)"\stitle="(.*?)".*?author">(.*?)</div>.*?year">(.*?)</span>.*?publisher">(.*?)</span>.*?</li>',re.S)
r = re.findall(pattern,html)
number = 0
for x in r:
for i in x:
print(i.strip())
print("--------------")
number += 1
print(number)
200
--------------
https://book.douban.com/subject/34937425/?icn=index-latestbook-subject
在路上
[美] 杰克·凯鲁亚克
2020-3
云南人民出版社
--------------
https://book.douban.com/subject/34873195/?icn=index-latestbook-subject
小津安二郎全日记
[日] 小津安二郎 / [日] 田中真澄
2020-2
上海译文出版社
--------------
https://book.douban.com/subject/34840588/?icn=index-latestbook-subject
望乡
[日] 凑佳苗
2020-2-1
文治图书·四川文艺出版社
--------------
https://book.douban.com/subject/34948397/?icn=index-latestbook-subject
巴黎记
于坚
2020-2-1
江苏凤凰文艺出版社/楚尘文化
--------------
40
其他
\w = [A-Za-z0-9_]
汉字: [\u4E00-\u9FA5]
[]:含有
[^]: 不含有
[a-z]:包含小写字符