正则表达式


正则表达式

工具网站

正则表达式

re模块

import re

content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
# Hello开头 \s 所有空字符 \d 数字 \s \d{4} 四个数字 \w 字母数字及下划线 .代替任意字符 * 一个或者多个 Demo结尾
result = re.match("^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$",content)
print(result)
print(result.group())
print(result.span())
41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)

泛匹配

import re

content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
# Hello开头 \s 所有空字符 \d 数字 \s \d{4} 四个数字 \w 字母数字及下划线 .代替任意字符 * 一个或者多个 Demo结尾
result = re.match("^Hello.*Demo$",content)
print(result)
print(result.group())
print(result.span())
41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)

匹配目标

import re

content = 'Hello 123 4567 World_This is a Regex Demo'

result = re.match('^Hello\s(\d+\s\d+)\sWorld.*Demo$',content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
123 4567
(0, 41)

贪婪匹配

import re

content = 'Hello 123 4567 World_This is a Regex Demo'

result = re.match('^He.*(\d+\s\d+).*Demo$',content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
3 4567
(0, 41)

非贪婪匹配

匹配尽可能少的字符 获取的结果尽可能多

import re

content = 'Hello 123 4567 World_This is a Regex Demo'

result = re.match('^He.*?(\d+\s\d+).*Demo$',content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
123 4567
(0, 41)

匹配模式

import re

content = '''Hello 123 4567 
World_This is a Regex Demo'''

result = re.match('^He.*?(\d+\s\d+).*?Demo$',content,re.S)
# . 不能匹配换行符 添加 re.S
print(result)

print(result.group(0))
print(result.group(1))
print(result.span())
<re.Match object; span=(0, 42), match='Hello 123 4567 \nWorld_This is a Regex Demo'>
Hello 123 4567 
World_This is a Regex Demo
123 4567
(0, 42)

转义

import re

content = 'price is $5.00'
r = re.match('price is \$5\.00',content)
print(r)
<re.Match object; span=(0, 14), match='price is $5.00'>

$\color{red}{尽量使用泛匹配,使用货号得到匹配目标,尽量使用非贪婪模式,有换行就用 re.S}$

re.Search

扫描匹配

import re

content = 'XHello 123 4567 World_This is a Regex Demo'
print(len(content))
result = re.match("Hello.*Demo$",content)
print(result)
42
None

content = 'XHello 123 4567 World_This is a Regex Demo'
print(len(content))
# Hello开头 \s 所有空字符 \d 数字 \s \d{4} 四个数字 \w 字母数字及下划线 .代替任意字符 * 一个或者多个 Demo结尾
result = re.search("Hello.*Demo$",content)
print(result)
print(result.group())
print(result.span())
42
<re.Match object; span=(1, 42), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(1, 42)

re.findall

re.compile

匹配联系练习

import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
        <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
        <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

results = re.search('<li.*active.*?singer="(.*?)">(.*?)</a>',html,re.S)

if results:
    print(results.group(0)+"\n------")
    print(results.group(1),results.group(2))
<li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
        <a href="/3.mp3" singer="齐秦">往事随风</a>
------
齐秦 往事随风
import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

results = re.search('<li.*?singer="(.*?)">(.*?)</a>',html,re.S)

if results:
    print(results.group(0)+"\n------")
    print(results.group(1),results.group(2))
<li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
------
任贤齐 沧海一声笑
import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

results = re.search('<li.*?singer="(.*?)">(.*?)</a>',html)

if results:
    print(results.group(0)+"\n------")
    print(results.group(1),results.group(2))
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a>
------
beyond 光辉岁月
import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

results = re.findall('<li.*?href="(.*?)"\ssinger="(.*?)">(.*?)</a>',html,re.S)

if results:
    for result in results:
        print(result[0],result[1],result[2])
/2.mp3 任贤齐 沧海一声笑
/3.mp3 齐秦 往事随风
/4.mp3 beyond 光辉岁月
/5.mp3 陈慧琳 记事本
import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </li> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

results = re.findall('<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>',html,re.S)

if results:
    for result in results:
        print(result)
('', '一路上有你', '')
('<a href="/2.mp3" singer="任贤齐">', '沧海一声笑', '</a>')
('<a href="/3.mp3" singer="齐秦">', '往事随风', '</a>')
('<a href="/4.mp3" singer="beyond">', '光辉岁月', '</a>')
('<a href="/5.mp3" singer="陈慧琳">', '记事本', '</a>')
('<a href=*/6.mp3" singer="邓丽君">', '但愿人长久', '</a>')

re.sub

替换字符串中每一个匹配的子串后返回替换后的字符串

import re 

content = 'Hello World!'

content = re.sub ('o',"O",content)

print(content)
HellO WOrld!
import re 

content = 'Hello World!'

content = re.sub ('(World!)',r'\1 你好!',content)

print(content)
Hello World! 你好!
import re


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </li> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

html = re.sub("<a .*?>|</a>",'',html)
print(html)
r = re.findall('<li.*?>(.*?)</li>',html,re.S)

for i in r:
    print(i.strip())
<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        沧海一声笑
    </li>
    <li data-view="4" class="active">
    往事随风
    </li>
    <li data-view="6">光辉岁月</li><li data一view="5">记事本 </li> <li data一view=*5">
    但愿人长久
    </li>
</ul>
</div>
一路上有你
沧海一声笑
往事随风
光辉岁月
记事本
但愿人长久

re.compile

将正则表达式串编译成正则表达对象,方便复用

import re

content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))

pattern = re.compile("^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$",re.S)

result = re.match(pattern,content)
print(result)
print(result.group())
print(result.span())
41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)

实战练习

import re 
import requests

headers = {
    'User-Agent':'Mozilla/5.0(Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}


html = requests.get('https://book.douban.com/', headers=headers)
print(html.status_code)
html = html.text
pattern = re.compile('<li.*?cover.*?href="(.*?)"\stitle="(.*?)".*?author">(.*?)</div>.*?year">(.*?)</span>.*?publisher">(.*?)</span>.*?</li>',re.S)

r = re.findall(pattern,html)
number = 0
for x in r:
    for i in x:
        print(i.strip())
    print("--------------")
    number += 1

print(number)
200
--------------
https://book.douban.com/subject/34937425/?icn=index-latestbook-subject
在路上
[美] 杰克·凯鲁亚克
2020-3
云南人民出版社
--------------
https://book.douban.com/subject/34873195/?icn=index-latestbook-subject
小津安二郎全日记
[日] 小津安二郎&nbsp;/&nbsp;[日] 田中真澄
2020-2
上海译文出版社
--------------
https://book.douban.com/subject/34840588/?icn=index-latestbook-subject
望乡
[日] 凑佳苗
2020-2-1
文治图书·四川文艺出版社
--------------
https://book.douban.com/subject/34948397/?icn=index-latestbook-subject
巴黎记
于坚
2020-2-1
江苏凤凰文艺出版社/楚尘文化
--------------
40

其他

  • \w = [A-Za-z0-9_]

  • 汉字: [\u4E00-\u9FA5]

  • []:含有

  • [^]: 不含有

  • [a-z]:包含小写字符

image-20210829230649412


文章作者: KawYang
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 KawYang !
评论
  目录