Scraping Every Car Model's Ratings and the First Ten Pages of Purchase Purposes from Xcar (爱卡网) with Python

Introduction

I still haven't looked at any crawler frameworks; honestly, I'm a bit afraid I couldn't learn them, heh.
This project is implemented as a focused (targeted) crawler and fetches pages with multiple threads.

Environment

Python 3
Modules: requests, re, threading, pymongo, gevent, bs4

How it works

Open the model listing page and use a regular expression to extract all model IDs.
Build each model's detail URL from its ID and scrape the overall ratings and purchase purposes from the detail page.
Analysis shows that each page of reviews is loaded via Ajax, so a separate function handles the first ten review pages.
The counts of all purchase purposes are tallied in a dictionary (a minimal sketch of this counting step follows the list).
The results are stored in MongoDB.
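
As a quick illustration of the counting step mentioned above, here is a minimal sketch using collections.Counter (the full script below does the same thing with a plain dict; the sample purpose list is made up):

from collections import Counter

# Hypothetical sample: one sub-list of purchase purposes per scraped review page.
purposes = [['上下班代步', '自驾游'], ['上下班代步', '接送小孩']]

# Flatten the per-page lists and tally how often each purpose appears.
counts = Counter(p for page in purposes for p in page)
print(dict(counts))  # {'上下班代步': 2, '自驾游': 1, '接送小孩': 1}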

Code

If you copy and run the code directly, please configure MongoDB and the IP proxy first.
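
Concretely, these are the two things to adapt, shown here as a minimal sketch; the host, port, and proxy address are placeholder values, not working endpoints:

from pymongo import MongoClient

# Placeholder MongoDB connection: main() writes into the `cars` collection of the `cars` database.
conn = MongoClient('127.0.0.1', 27017)
cars = conn.cars.cars

# Placeholder HTTPS proxy used by gethtml(); replace it with a proxy you actually control.
proex = {'https': 'https://your.proxy.example:8080'}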

# -*- coding: utf-8 -*-
from gevent import monkey
monkey.patch_all()  # patch blocking I/O before requests/threading are imported

import re
import threading

import requests
import gevent
from bs4 import BeautifulSoup
from pymongo import MongoClient

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

def gethtml(url):
    '''Fetch the HTML text of a URL.'''
    # Replace with your own working proxy (or drop the proxies argument entirely).
    proex = {'https': "https://59.37.132.30:8080"}
    try:
        html = requests.get(url, headers=headers, timeout=15, proxies=proex)
        html.encoding = html.apparent_encoding
        html.raise_for_status()
        return html.text
    except BaseException as f:
        print('Failed to fetch the page, error:', f)

def geturls(html):
    '''Extract all car model URL IDs from the listing page.'''
    try:
        patten = re.compile(r'a href="(/\d+/)" target="_blank"')
        urls = re.findall(patten, html)
        return urls
    except BaseException as f:
        print('Failed to extract URLs, error:', f)

def getpages(url):
    '''Get the number of review pages for a model.'''
    try:
        html = gethtml(url)
        soup = BeautifulSoup(html, 'lxml')
        page = soup.find(class_='unify_page mt20')
        pat_f = re.compile(r'rel="nofollow">(\d+)</a>')
        pages = re.findall(pat_f, str(page))
        return pages
    except BaseException as f:
        print('Failed to get the review page count, error:', f)

def getall(pages, url, purposes):
    '''Build the Ajax requests and collect the ratings plus purchase purposes.'''
    try:
        data = getinfo(url, purposes)
        if len(pages) == 0:
            return data
        car_id = url.split('/', 4)[3]  # slice the model ID out of the URL
        # If there are more than 10 review pages, fetch only 10; otherwise fetch all of them.
        if int(pages[-1]) > 10:
            listone = list(range(2, 11))
            threads = [
                threading.Thread(target=getmoinfo, args=(listone[:4], car_id, purposes)),
                threading.Thread(target=getmoinfo, args=(listone[4:6], car_id, purposes)),
                threading.Thread(target=getmoinfo, args=(listone[6:9], car_id, purposes)),
            ]
        else:
            listone = list(range(2, int(pages[-1]) + 1))
            threads = [threading.Thread(target=getmoinfo, args=(listone, car_id, purposes))]
        for t in threads:
            t.start()
        for t in threads:
            t.join()  # wait so that `purposes` is complete before it is counted
        return data
    except BaseException as f:
        print('Failed to collect the review pages, error:', f)

def getmoinfo(pages, car_id, purposes):
    '''Walk the Ajax URLs for the given review pages and collect purchase purposes.'''
    f_url = 'http://newcar.xcar.com.cn/auto/index.php'
    for page in pages:
        data = {'r': 'reputation/reputation/GetAjaxKbList3',
                'page': page,
                'pserid': car_id,
                'jh': '0',
                'wd': '0'}
        try:
            # resolve the final Ajax URL, then fetch it through the proxy
            url = requests.get(f_url, params=data).url
            html = gethtml(url)
            soup = BeautifulSoup(html, 'lxml')
            # purchase purposes on this review page
            purposee = soup.find_all(class_='purpose clearfix')
            pat_s = re.compile('<em>(.*?)</em>')
            purpose = re.findall(pat_s, str(purposee))
            purposes.append(purpose)
        except BaseException as f:
            print('Failed to fetch review data, error:', f)

def getinfo(url, purposes):
    '''Scrape the overall ratings of a model.'''
    try:
        data = {}
        html = gethtml(url)
        soup = BeautifulSoup(html, 'lxml')
        pat_o = re.compile('<p>综合评分:<em>(.*?)</em>分</p>')
        # overall score
        synthesis = re.findall(pat_o, str(soup))
        # model name
        title = soup.find(class_='tt_h1').get_text()
        # per-category scores
        infos = soup.find(class_='column')
        pat_t = re.compile(r'\d+\.\d+分')
        info = re.findall(pat_t, str(infos))
        # purchase purposes on the first review page
        purposee = soup.find_all(class_='purpose clearfix')
        pat_s = re.compile('<em>(.*?)</em>')
        purpose = re.findall(pat_s, str(purposee))
        purposes.append(purpose)
        data['车型'] = title.strip()     # model
        data['综合评分'] = synthesis[0]  # overall score
        data['外观'] = info[0]           # exterior
        data['内饰'] = info[1]           # interior
        data['空间'] = info[2]           # space
        data['舒适'] = info[3]           # comfort
        data['耗油'] = info[4]           # fuel consumption
        data['动力'] = info[5]           # power
        data['操控'] = info[6]           # handling
        data['性价比'] = info[7]         # value for money
        return data
    except BaseException as f:
        print('Failed to scrape the ratings, error:', f)

def main(urls, num):
    '''Entry point for one batch of model URLs.'''
    try:
        conn = MongoClient('127.0.0.1', 27017)
        db = conn.cars
        cars = db.cars
        for url1 in urls:
            num += 1
            url = 'http://newcar.xcar.com.cn' + url1 + 'review.htm'
            if requests.get(url).url != url:
                # the request was redirected: this model has no ratings or purchase purposes
                print('No overall rating or purchase purposes for', url)
            else:
                purposes = []
                print('Processing link {}: {}'.format(num, url))
                pages = getpages(url)
                data = getall(pages, url, purposes)
                clearfix = {}
                call = []
                # flatten the per-page purpose lists, then count how often each purpose appears
                for a in purposes:
                    for b in a:
                        call.append(b)
                for c in set(call):
                    clearfix[c] = call.count(c)
                data['购车用途'] = clearfix  # purchase purposes
                cars.insert_one(data)
                print('Saved record number {}'.format(num))
    except BaseException as f:
        print('Main function failed, error:', f)

if __name__ == '__main__':
    url = "http://newcar.xcar.com.cn/price/"
    html = gethtml(url)
    urls = geturls(html)[1:]  # skip the first matched link
    jobs = []
    num = 0
    # spawn one greenlet per batch of 10 model URLs
    for x in range(0, len(urls), 10):
        jobs.append(gevent.spawn(main, urls[x:x + 10], num))
    gevent.joinall(jobs)
    print('---------- All data saved ----------')

The first run was single-threaded and took about 5 hours to crawl everything.
After switching to multi-threaded processing, the second run finished in half an hour.