Python 爬虫

Python

发布日期: 2020-08-29

文章字数: 3.4k

阅读时长: 18 分

阅读次数:

正则表达式

工具网站

正则表达式

re模块

import re

content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
# Pattern walkthrough: ^Hello anchors the start; \s is one whitespace
# character; \d is one digit; \d{4} is exactly four digits; \w{10} is ten
# word characters (letters, digits, underscore); . is any character except
# a newline; * repeats the previous item zero or more times; Demo$ anchors
# the end.
# The pattern is a raw string so the backslashes reach the regex engine
# untouched instead of relying on deprecated invalid string escapes.
result = re.match(r"^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$", content)
print(result)
print(result.group())
print(result.span())

41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)

泛匹配

import re

# Generic match: only the fixed prefix and suffix are spelled out, and .*
# (any character except a newline, zero or more times) covers everything
# in between.
content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
result = re.match("^Hello.*Demo$", content)
for shown in (result, result.group(), result.span()):
    print(shown)

41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)

匹配目标

import re

content = 'Hello 123 4567 World_This is a Regex Demo'

# The parentheses form capture group 1 around the two digit runs, so
# group(1) returns just that part while group(0) is the whole match.
# Raw string: \s and \d are regex escapes, not valid Python string escapes.
result = re.match(r'^Hello\s(\d+\s\d+)\sWorld.*Demo$', content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())

<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
123 4567
(0, 41)

贪婪匹配

import re

content = 'Hello 123 4567 World_This is a Regex Demo'

# Greedy .* consumes as much as it can while still letting the rest of the
# pattern match, so it swallows "12" and leaves only "3 4567" for group 1.
# Raw string: \d and \s are regex escapes, not valid Python string escapes.
result = re.match(r'^He.*(\d+\s\d+).*Demo$', content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())

<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
3 4567
(0, 41)

非贪婪匹配

非贪婪模式（.*?）会匹配尽可能少的字符，从而让后面的分组捕获到尽可能完整的结果

import re

content = 'Hello 123 4567 World_This is a Regex Demo'

# Non-greedy .*? consumes as little as possible, so it stops before the
# first digit and group 1 captures the complete "123 4567".
# Raw string: \d and \s are regex escapes, not valid Python string escapes.
result = re.match(r'^He.*?(\d+\s\d+).*Demo$', content)
print(result)
print(result.group(0))
print(result.group(1))
print(result.span())

<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
123 4567
(0, 41)

匹配模式

import re

# The text spans two lines. The space before the newline is significant
# (it is counted in the 42-character span), so it is written explicitly
# here rather than hidden as trailing whitespace in a triple-quoted string.
content = (
    'Hello 123 4567 \n'
    'World_This is a Regex Demo'
)

# By default . does not match a newline; re.S (DOTALL) lets it do so, which
# is what allows .*? to cross the line break.
# Raw string: \d and \s are regex escapes, not valid Python string escapes.
result = re.match(r'^He.*?(\d+\s\d+).*?Demo$', content, re.S)
print(result)

print(result.group(0))
print(result.group(1))
print(result.span())

<re.Match object; span=(0, 42), match='Hello 123 4567 \nWorld_This is a Regex Demo'>
Hello 123 4567 
World_This is a Regex Demo
123 4567
(0, 42)

转义

import re

content = 'price is $5.00'
# $ and . are regex metacharacters, so they are backslash-escaped to match
# literally. The raw string keeps those backslashes intact for the regex
# engine instead of relying on deprecated invalid string escapes.
r = re.match(r'price is \$5\.00', content)
print(r)

<re.Match object; span=(0, 14), match='price is $5.00'>

$\color{red}{尽量使用泛匹配，使用括号得到匹配目标，尽量使用非贪婪模式，有换行就用 re.S}$

re.search

扫描匹配

import re

# re.match only tries the pattern at the very start of the string, so the
# leading "X" makes it return None.
content = 'XHello 123 4567 World_This is a Regex Demo'
print(len(content))
pattern = "Hello.*Demo$"
result = re.match(pattern, content)
print(result)

42
None


# re.search scans the string for the first position where the pattern
# matches, so the leading "X" is skipped and the match starts at index 1.
content = 'XHello 123 4567 World_This is a Regex Demo'
print(len(content))
result = re.search("Hello.*Demo$", content)
for shown in (result, result.group(), result.span()):
    print(shown)

42
<re.Match object; span=(1, 42), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(1, 42)

re.findall

re.compile

匹配练习

import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
        <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
        <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

# The greedy .* between "<li" and "active" runs from the first <li up to
# the active item; the lazy groups then capture the singer and title of the
# first link that follows. re.S lets . cross line breaks.
results = re.search('<li.*active.*?singer="(.*?)">(.*?)</a>', html, flags=re.S)

if results is not None:
    whole, singer, title = results.group(0, 1, 2)
    print(whole + "\n------")
    print(singer, title)

<li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
        <a href="/3.mp3" singer="齐秦">往事随风</a>
------
齐秦 往事随风

import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

# With the "active" requirement dropped and a lazy .*?, the match runs from
# the first <li to the first singer attribute that follows it. re.S lets .
# cross line breaks.
results = re.search('<li.*?singer="(.*?)">(.*?)</a>', html, flags=re.S)

if results is not None:
    whole, singer, title = results.group(0, 1, 2)
    print(whole + "\n------")
    print(singer, title)

<li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
------
任贤齐 沧海一声笑

import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

# Without re.S the . cannot cross a newline, so only an <li> whose link
# sits on the same line can match.
results = re.search('<li.*?singer="(.*?)">(.*?)</a>', html)

if results is not None:
    whole, singer, title = results.group(0, 1, 2)
    print(whole + "\n------")
    print(singer, title)

<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a>
------
beyond 光辉岁月

import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </i> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

# findall returns one (href, singer, title) tuple per match because the
# pattern has three capture groups. The last link is skipped: its href
# attribute is malformed (href=*/6.mp3") and never matches href=".
# Raw string: \s is a regex escape, not a valid Python string escape.
results = re.findall(r'<li.*?href="(.*?)"\ssinger="(.*?)">(.*?)</a>', html, re.S)

# Iterating an empty list is a no-op, so no separate emptiness check is needed.
for href, singer, title in results:
    print(href, singer, title)

/2.mp3 任贤齐 沧海一声笑
/3.mp3 齐秦 往事随风
/4.mp3 beyond 光辉岁月
/5.mp3 陈慧琳 记事本

import re 


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </li> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

# The opening and closing anchor tags are optional groups, so list items
# with and without a link both match; an absent optional group shows up as
# an empty string in the result tuple.
# Raw string: \s and \w are regex escapes, not valid Python string escapes.
results = re.findall(r'<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>', html, re.S)

# Iterating an empty list is a no-op, so no separate emptiness check is needed.
for result in results:
    print(result)

('', '一路上有你', '')
('<a href="/2.mp3" singer="任贤齐">', '沧海一声笑', '</a>')
('<a href="/3.mp3" singer="齐秦">', '往事随风', '</a>')
('<a href="/4.mp3" singer="beyond">', '光辉岁月', '</a>')
('<a href="/5.mp3" singer="陈慧琳">', '记事本', '</a>')
('<a href=*/6.mp3" singer="邓丽君">', '但愿人长久', '</a>')

re.sub

替换字符串中每一个匹配的子串后返回替换后的字符串

import re 

# re.sub replaces every match of the pattern, so both lowercase "o"
# characters become "O".
original_text = 'Hello World!'
content = re.sub('o', "O", original_text)

print(content)

HellO WOrld!

import re 

content = 'Hello World!'

# \1 in the replacement refers back to the text captured by group 1, so the
# matched "World!" is kept and the greeting is appended after it.
captured_word = re.compile('(World!)')
content = captured_word.sub(r'\1 你好!', content)

print(content)

Hello World! 你好!

import re


html = """<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
    </li>
    <li data-view="4" class="active">
    <a href="/3.mp3" singer="齐秦">往事随风</a>
    </li>
    <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li><li data一view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a> </li> <li data一view=*5">
    <a href=*/6.mp3" singer="邓丽君">但愿人长久</a>
    </li>
</ul>
</div>"""

# Step 1: delete every opening and closing anchor tag, so each <li> holds
# only its title text.
anchor_tags = re.compile("<a .*?>|</a>")
html = anchor_tags.sub('', html)
print(html)

# Step 2: with the links gone, one simple pattern captures the body of each
# list item; re.S lets the capture span line breaks.
list_items = re.compile('<li.*?>(.*?)</li>', re.S)
r = list_items.findall(html)

for item in r:
    # Titles that sat on their own line still carry surrounding whitespace.
    print(item.strip())

<div id="songs-list'>
<h2 class="title">经典老歌</h2>
<p class="introduction">
    经典老歌列表
</p>
<ul id="list" class="list-group">
    <li data-view="2">一路上有你</li>
    <li data-view="7">
        沧海一声笑
    </li>
    <li data-view="4" class="active">
    往事随风
    </li>
    <li data-view="6">光辉岁月</li><li data一view="5">记事本 </li> <li data一view=*5">
    但愿人长久
    </li>
</ul>
</div>
一路上有你
沧海一声笑
往事随风
光辉岁月
记事本
但愿人长久

re.compile

将正则表达式字符串编译成正则表达式对象，方便复用

import re

content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))

# Compile once so the pattern object (flags included) can be reused.
# Raw string: \s, \d and \w are regex escapes, not valid Python string
# escapes.
pattern = re.compile(r"^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$", re.S)

# Call the compiled object's own method rather than passing it back
# through the module-level re.match.
result = pattern.match(content)
print(result)
print(result.group())
print(result.span())

41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)

实战练习

import re 
import requests

# A browser-like User-Agent is sent with the request.
headers = {
    'User-Agent':'Mozilla/5.0(Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get('https://book.douban.com/', headers=headers, timeout=10)
print(response.status_code)
html = response.text

# Five capture groups per book entry: link, title, author, year, publisher.
# re.S lets . cross line breaks, since each <li> spans several lines.
# Raw string: \s is a regex escape, not a valid Python string escape.
pattern = re.compile(
    r'<li.*?cover.*?href="(.*?)"\stitle="(.*?)".*?author">(.*?)</div>'
    r'.*?year">(.*?)</span>.*?publisher">(.*?)</span>.*?</li>',
    re.S,
)

r = pattern.findall(html)
for x in r:
    for i in x:
        # Captured fields carry the surrounding whitespace of the markup.
        print(i.strip())
    print("--------------")

# Number of book entries matched.
number = len(r)
print(number)

200
--------------
https://book.douban.com/subject/34937425/?icn=index-latestbook-subject
在路上
[美] 杰克·凯鲁亚克
2020-3
云南人民出版社
--------------
https://book.douban.com/subject/34873195/?icn=index-latestbook-subject
小津安二郎全日记
[日] 小津安二郎&nbsp;/&nbsp;[日] 田中真澄
2020-2
上海译文出版社
--------------
https://book.douban.com/subject/34840588/?icn=index-latestbook-subject
望乡
[日] 凑佳苗
2020-2-1
文治图书·四川文艺出版社
--------------
https://book.douban.com/subject/34948397/?icn=index-latestbook-subject
巴黎记
于坚
2020-2-1
江苏凤凰文艺出版社/楚尘文化
--------------
40