0%

爬虫

爬虫:https://www.cnblogs.com/wupeiqi/articles/5354900.html
详细连接:https://www.cnblogs.com/wupeiqi/articles/6283017.html

一、爬虫基本操作

1.1 爬虫

  • 定向
  • 非定向

1.2 步骤

  • 下载页面
  • 筛选:正则表达式

1.3 开源模块

  • requests
    • response = request.get()
    • response.txt
  • beautifulsoup
    • bs = Beautifulsoup(request.txt)
    • bs.find(id = “”)

二、requests库

Requests 是使用 Apache2 Licensed 许可证的 基于Python开发的HTTP 库,其在Python内置模块的基础上进行了高度的封装,从而使得Pythoner进行网络请求时,变得美好了许多,使用Requests可以轻而易举的完成浏览器可有的任何操作。

1.Get请求

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# 1、无参数实例

import requests

ret = requests.get('https://github.com/timeline.json')

print ret.url
print ret.text



# 2、有参数实例

import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)

print ret.url
print ret.text

2.POST请求

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# 1、基本POST实例

import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)

print ret.text


# 2、发送请求头和数据实例

import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)

print ret.text
print ret.cookies

总结

1
2
3
4
5
6
7
8
9
10
11
import requests
response =resquests.get(url = "")
response = requests.post(url = "",data={"":""},cookies={"asdf":"asfdasdf"})

response.content #字节内容
response.encoding #编码方式
response.apparent_encoding
response.text #文本内容
response.status_code # 200,300........
response.cookies # cookies对象
response.cookies.get_dict() #cookies字典

详细

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def param_method_url():
# requests.request(method='get', url='http://127.0.0.1:8000/test/')
# requests.request(method='post', url='http://127.0.0.1:8000/test/')
pass


def param_param():
# - 可以是字典
# - 可以是字符串
# - 可以是字节(ascii编码以内)

# requests.request(method='get',
# url='http://127.0.0.1:8000/test/',
# params={'k1': 'v1', 'k2': '水电费'})

# requests.request(method='get',
# url='http://127.0.0.1:8000/test/',
# params="k1=v1&k2=水电费&k3=v3&k3=vv3")

# requests.request(method='get',
# url='http://127.0.0.1:8000/test/',
# params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))

# 错误
# requests.request(method='get',
# url='http://127.0.0.1:8000/test/',
# params=bytes("k1=v1&k2=水电费&k3=v3&k3=vv3", encoding='utf8'))
pass


def param_data():
# 可以是字典
# 可以是字符串
# 可以是字节
# 可以是文件对象

# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data={'k1': 'v1', 'k2': '水电费'})

# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data="k1=v1; k2=v2; k3=v3; k3=v4"
# )

# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data="k1=v1;k2=v2;k3=v3;k3=v4",
# headers={'Content-Type': 'application/x-www-form-urlencoded'}
# )

# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data=open('data_file.py', mode='r', encoding='utf-8'), # 文件内容是:k1=v1;k2=v2;k3=v3;k3=v4
# headers={'Content-Type': 'application/x-www-form-urlencoded'}
# )
pass


def param_json():
# 将json中对应的数据进行序列化成一个字符串,json.dumps(...)
# 然后发送到服务器端的body中,并且Content-Type是 {'Content-Type': 'application/json'}
requests.request(method='POST',
url='http://127.0.0.1:8000/test/',
json={'k1': 'v1', 'k2': '水电费'})


def param_headers():
# 发送请求头到服务器端
requests.request(method='POST',
url='http://127.0.0.1:8000/test/',
json={'k1': 'v1', 'k2': '水电费'},
headers={'Content-Type': 'application/x-www-form-urlencoded'}
)


def param_cookies():
# 发送Cookie到服务器端
requests.request(method='POST',
url='http://127.0.0.1:8000/test/',
data={'k1': 'v1', 'k2': 'v2'},
cookies={'cook1': 'value1'},
)
# 也可以使用CookieJar(字典形式就是在此基础上封装)
from http.cookiejar import CookieJar
from http.cookiejar import Cookie

obj = CookieJar()
obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
)
requests.request(method='POST',
url='http://127.0.0.1:8000/test/',
data={'k1': 'v1', 'k2': 'v2'},
cookies=obj)


def param_files():
# 发送文件(字典)
# file_dict = {
# 'f1': open('readme', 'rb')
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 发送文件,定制文件名(以元组作为值)
# file_dict = {
# 'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 发送文件,定制文件名(文件内容通过自己输入)
# file_dict = {
# 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 发送文件,定制文件名
# file_dict = {
# 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

pass


def param_auth():
from requests.auth import HTTPBasicAuth, HTTPDigestAuth

ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
print(ret.text)

# ret = requests.get('http://192.168.1.1',
# auth=HTTPBasicAuth('admin', 'admin')) ##用户名和密码进行BASIC64认证,一般用不到
# ret.encoding = 'gbk'
# print(ret.text)

# ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
# print(ret)
#


def param_timeout():
# ret = requests.get('http://google.com/', timeout=1)
# print(ret)

# ret = requests.get('http://google.com/', timeout=(5, 1))
# print(ret)
pass


def param_allow_redirects():
ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.text)


# http是指通过socket.直接发消息没有加密
# https有ssl加密。

# verify=False忽略证书的存在
# cert = 证书文件 发送证书

def param_proxies():
# proxies = {
# "http": "61.172.249.96:80",
# "https": "http://61.185.219.126:3128",
# }

# proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

# ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
# print(ret.headers)


# from requests.auth import HTTPProxyAuth
#
# proxyDict = {
# 'http': '77.75.105.165',
# 'https': '77.75.105.165'
# }
# auth = HTTPProxyAuth('username', 'mypassword')
#
# r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
# print(r.text)

pass


def param_stream():
ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
print(ret.content)
ret.close()

# from contextlib import closing
# with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
# # 在此处理响应。为True时分段传输
# for i in r.iter_content():
# print(i)

#session自动对cookie,请求头等进行处理,(保存客户端历史访问信息)
def requests_session():
import requests

session = requests.Session()

### 1、首先登陆任何页面,获取cookie

i1 = session.get(url="http://dig.chouti.com/help/service")

### 2、用户登陆,携带上一次的cookie,后台对cookie中的 gpsd 进行授权
i2 = session.post(
url="http://dig.chouti.com/login",
data={
'phone': "8615131255089",
'password': "xxxxxx",
'oneMonth': ""
}
)

i3 = session.post(
url="http://dig.chouti.com/link/vote?linksId=8589623",
)
print(i3.text)

参数示例

三、beautifulsoup4库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text,feature = "html.parser")
## feature解析引擎,html.parser为python自带引擎
## lxml为第三方引擎,需要安装第三方库
target = soup.find(id="auto-channel-lazyload-article")

li_list = target.find_all("li") #根据标签查找所有的li,,, find查找第一个


for li_item in li_list:
a_item = li_item.find("a")
if a_item:
href_item = "https:" + a_item.attrs.get("href")
# attrs 获得标签属性字典
# get获得标签的某个属性
print(href_item)

txt = a_item.find("h3").text # a_item.find("img")得到的对象为tag对象,得到text需要 a_item.find("img").text
img_url = "https:" + a_item.find("img").attrs.get("src")
print(img_url)

import re
# 将网址中的www3等替换为www
mod = re.compile("^https://www\d")
img_url = re.sub(mod, "https://www", img_url)
# 下载并保存图片
img = requests.get(url=img_url)
import uuid

file_name = str(uuid.uuid4()) + ".png"
with open(file_name, "wb") as f:
f.write(img.content) # .content 是以字节储存的内容

总结:

1
2
3
4
5
6
7
8
9
10
11
12
13
soup = BeautifulSoup(response.text,feature = "html.parser")
#查询一个
v1 = soup.find("div") #标签查询
v1 = soup.find(id = "test") #属性查询
v1 = soup.find("div"id ="test") # 组合查询
#查询所有
v2 = soup.find_all("div")

obj = v1
obj = v2[0]

obj.text #内容
obj.attrs #属性

自动登录

点击登录,若页面刷新,以form表单提交,若为刷新,以ajax提交

form提交和ajax提交https://blog.csdn.net/gghh2015/article/details/79116718

模块详细

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import requests

post_dict = {"phone": 1231234124,
"password": "asdfasdf",
"loginType": 2,
}
response = requests.post(
url="https://dig.chouti.com/login",
data=post_dict
)

response.text
cookies_dict = response.cookies.get_dict()
## 一般情况只需要将url和cookies发过去
requests.get(url="asfdasf", cookies=cookies_dict)
## 查看操作时的request头
c. 模块详细使用
    requests
    
    - 方法关系
        requests.get(.....)
        requests.post(.....)
        requests.put(.....)
        requests.delete(.....)
        ...
        
        requests.request('POST'...)
    - 参数
        request.request
        - method:  提交方式
        - url:     提交地址
        - params:  在URL中传递的参数,GET 
            requests.request(
                method='GET',
                url= 'http://www.oldboyedu.com',
                params = {'k1':'v1','k2':'v2'}
            )
            # http://www.oldboyedu.com?k1=v1&k2=v2
        - data:    在请求体里传递的数据
        
            requests.request(
                method='POST',
                url= 'http://www.oldboyedu.com',
                params = {'k1':'v1','k2':'v2'},
                data = {'use':'alex','pwd': '123','x':[11,2,3]}  #data中不能有字典
            )
               #如果以data形式传递,request将数据封装为请求头和请求体
            请求头:
                content-type: application/url-form-encod..... 
            请求体:
                use=alex&pwd=123

                 - json   在请求体里传递的数据
            requests.request(
                method='POST',
                url= 'http://www.oldboyedu.com',
                params = {'k1':'v1','k2':'v2'},
                json = {'use':'alex','pwd': '123'}
            )
            请求头:
                content-type: application/json
            请求体:
                "{'use':'alex','pwd': '123'}"
            PS: 字典中嵌套字典时使用json,如果没有嵌套,用data即可
            
        - headers   请求头
        
            requests.request(
                method='POST',
                url= 'http://www.oldboyedu.com',
                params = {'k1':'v1','k2':'v2'},
                json = {'use':'alex','pwd': '123'},
                headers={
                    'Referer': 'http://dig.chouti.com/',  ##上一次访问的网站
                    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"   ##客户端
                }
            )
         - cookies  Cookies 放到header中发送
                 
         - files    上传文件
         
         - auth      基本认知(headers中加入加密的用户名和密码)
         
         - timeout  请求和响应的超市时间
         
         - allow_redirects  是否允许重定向
         
         - proxies  代理
         
         - verify   是否忽略证书
         
         - cert      证书文件
         
         - stream   村长下大片
         
             - session: 用于保存客户端历史访问信息