python爬虫---爬取优信二手车

import requests
import re
import redis
from lxml import etree
import pymysql
#获取城市数据
class City():
    """Stage 1 of the crawler: collect per-city listing URLs into Redis.

    The city list is fetched from the site's JSON endpoint, each city code
    is turned into a listing URL, and every URL is appended to the Redis
    list ``city_url``. Subclasses reuse ``self.r`` and ``get_html``.
    """

    # JSON endpoint that returns the cities shown on the home page.
    CITY_API_URL = "https://www.xin.com/apis/Ajax_common/get_home_city/"
    # Listing page for one city; ``{}`` receives the city's ``ename`` code.
    CITY_URL_TEMPLATE = "https://www.xin.com/{}/s/?channel=a49b117c44837d110753e751863f53"
    # Seconds to wait for the server; without a timeout requests can block forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Open the Redis connection shared by every stage.
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        """Run this stage: fetch the cities and queue their URLs."""
        self.get_city()

    def get_redis(self):
        """Return a client for the local Redis instance (db 1).

        Redis holds the intermediate URL lists so later stages do not have
        to hit the website again.
        """
        return redis.Redis(host='127.0.0.1', port=6379, db=1)

    def get_city(self):
        """Fetch the city list and push one listing URL per city to Redis.

        NOTE(review): URLs are appended with ``rpush``; running this stage
        twice leaves duplicates in ``city_url`` -- confirm whether the list
        should be cleared first.
        """
        payload = requests.get(
            self.CITY_API_URL, timeout=self.REQUEST_TIMEOUT
        ).json()
        for city in self._collect_enames(payload):
            self.r.rpush("city_url", self.CITY_URL_TEMPLATE.format(city))

    @staticmethod
    def _collect_enames(payload):
        """Return every string stored under an ``ename`` key, in document order.

        Walks the decoded JSON directly instead of pattern-matching its
        ``str()`` form, so values containing quote characters are not lost.

        Args:
            payload: decoded JSON (nested dicts / lists / scalars).

        Returns:
            list of ``ename`` strings; empty when none are present.
        """
        def walk(node):
            if isinstance(node, dict):
                for key, value in node.items():
                    if key == 'ename' and isinstance(value, str):
                        yield value
                    else:
                        yield from walk(value)
            elif isinstance(node, (list, tuple)):
                for item in node:
                    yield from walk(item)

        return list(walk(payload))

    def get_html(self, url):
        """Download ``url`` and return ``(raw_html_text, parsed_lxml_tree)``."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        html = response.text
        return html, etree.HTML(html)
# 获取所有车的品牌：
class AutomobileBrand(City):
    """Stage 2 of the crawler: collect brand page URLs for every queued city.

    Reads the Redis list ``city_url``, downloads each city page, extracts
    the brand links, and appends ``url, name`` pairs to the Redis list
    ``brand_url_name`` (flat layout: url, name, url, name, ...).
    """

    def __call__(self, *args, **kwargs):
        """Run this stage."""
        self.get_brand()

    def get_brand(self):
        """Store every brand URL and brand name found on each city page.

        NOTE(review): pairs are appended with ``rpush``; running this stage
        twice leaves duplicates in ``brand_url_name``.
        """
        for raw_city_url in self.r.lrange('city_url', 0, -1):
            # Redis returns bytes; the page text itself is not needed here.
            _, html_xml = self.get_html(raw_city_url.decode('utf-8'))
            brand_names = html_xml.xpath('//ul//li[position()>1]/dl/dd/a/text()')
            brand_hrefs = html_xml.xpath('//ul//li[position()>1]/dl/dd/a/@href')
            for brand_url, brand_name in self._brand_pairs(brand_hrefs, brand_names):
                # Url first, name second: readers take every other element.
                self.r.rpush('brand_url_name', brand_url, brand_name)

    @staticmethod
    def _brand_pairs(hrefs, names):
        """Pair each href with its label as ``(absolute_url, stripped_name)``.

        The hrefs are prefixed with ``https:``. ``zip`` stops at the shorter
        list, so a length mismatch cannot raise ``IndexError``.
        """
        return [('https:' + href, name.strip()) for href, name in zip(hrefs, names)]
# 准备获取数据：
class Car(AutomobileBrand):
    """Stage 3 of the crawler: scrape every listing and store it in MySQL.

    For each brand URL in the Redis list ``brand_url_name`` the brand page
    is downloaded, its car-series links are followed, every result page of
    every series is visited, and each listing is inserted into the
    ``youxin`` table.
    """

    # Placeholder image that appears in ``src`` for some listings; the real
    # picture URL is then read from ``data-original``.
    DEFAULT_PIC = '//s6.xinstatic.com/www/img/default.png'
    # Text the site shows when a series has no cars on offer.
    NO_STOCK_TEXT = "小优还没有为您准备好车源"
    # Run of spaces that separates the down payment from the monthly payment
    # once newlines are removed (kept exactly as the page layout requires).
    PAY_SEPARATOR = '                                                            '
    # Column order used for both the INSERT statement and its parameters.
    COLUMNS = (
        'car_pic', 'car_name', 'car_year', 'car_km', 'car_house',
        'car_first_money', 'car_mouth_money', 'car_price', 'car_details',
    )
    # Parameterised statement: the driver escapes every value, so quotes in
    # scraped text can neither break nor inject into the query.
    INSERT_SQL = (
        "insert into youxin (car_pic,car_name,car_year,car_km,car_house,"
        "car_first_money,car_mouth_money,car_price,car_details)\n"
        "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    def __call__(self, *args, **kwargs):
        """Run this stage; the MySQL connection is closed when it ends."""
        self.count = 1  # running number of successfully inserted rows
        self.conn_mysql()
        try:
            self.get_system()
        finally:
            self.cur.close()
            self.conn.close()

    def get_system(self):
        """Follow every car-series link of every stored brand page."""
        brand_url_name = self.r.lrange('brand_url_name', 0, -1)
        # The list is flat (url, name, url, name, ...): even slots are URLs.
        for raw_brand_url in brand_url_name[::2]:
            brand_url = raw_brand_url.decode('utf-8')
            _, html_xml = self.get_html(brand_url)
            series_names = html_xml.xpath('//div[@id="search_serial"]//ul//div/li/a/text()')
            series_hrefs = html_xml.xpath('//div[@id="search_serial"]//ul//div/li/a/@href')
            print(series_hrefs)
            for position, href in enumerate(series_hrefs):
                # A missing label must not stop the series from being crawled.
                name = series_names[position] if position < len(series_names) else ''
                print('=============正在下载车系{}==============='.format(name))
                car_sys = 'https:' + href
                print(car_sys)
                self.get_page(car_sys)

    def get_page(self, car_sys):
        """Visit every result page of one car series.

        Args:
            car_sys: absolute URL of the series' first result page.
        """
        html, html_xml = self.get_html(car_sys)
        page_numbers = html_xml.xpath('//div[@class="con-page search_page_link"]//a/@data-page')
        if not page_numbers:
            if self.NO_STOCK_TEXT in html:
                # Nothing on offer for this series.
                return
            # No pagination bar but there are cars: a single result page.
            last_page = 1
        elif len(page_numbers) < 2:
            print('unexpected pagination for {}: {}'.format(car_sys, page_numbers))
            return
        else:
            # The second-to-last link carries the highest page number
            # (presumably the last link is "next page" -- TODO confirm).
            last_page = int(page_numbers[-2])

        for page in range(1, last_page + 1):
            print('===================第{}页开始下载=================='.format(page))
            self.get_data(car_sys + "i{}/".format(page))

    def get_data(self, page_url):
        """Parse every listing on one result page and insert it into MySQL."""
        _, html_xml = self.get_html(page_url)
        # Narrow the search to the listing container first.
        li_list = html_xml.xpath('//div[@class="_list-con list-con clearfix ab_carlist"]/ul/li')
        for li in li_list:
            try:
                car_dict = self._parse_listing(li)
            except IndexError as e:
                # A listing missing an expected node is skipped on its own
                # instead of aborting the remaining listings and pages.
                print(e)
                continue
            self.insert_mysql(car_dict)

    def _parse_listing(self, li):
        """Build the row dict for one ``<li>`` listing element.

        Raises:
            IndexError: when a mandatory node (picture, name, year,
                mileage, warehouse, detail link) is absent.
        """
        pad_texts = li.xpath('.//div[@class="pad"]/span/text()')
        pay_text = ','.join(
            li.xpath('.//div[@class="pad"]/span[@class="pay-price"]/text()')
        ).replace('\n', '').strip()
        first_money, month_money = self._split_payment(pay_text)
        return {
            'car_pic': self._picture_url(li),
            'car_name': li.xpath('.//h2/span/text()')[0],
            'car_year': pad_texts[0].strip().replace('年', ''),
            'car_km': pad_texts[1].strip(),
            'car_house': li.xpath('.//div[@class="pad"]/span/span/text()')[0],
            'car_first_money': first_money,
            'car_mouth_money': month_money,
            'car_price': self._price(li),
            'car_details': "https:" + li.xpath('.//a/@href')[0],
        }

    def _picture_url(self, li):
        """Return the listing's absolute picture URL without the ``_18`` marker."""
        sources = li.xpath('.//a/img/@src')
        if not sources or not sources[0] or self.DEFAULT_PIC in sources[0]:
            # ``src`` is empty or holds the placeholder image.
            sources = li.xpath('.//a/img/@data-original')
        picture = sources[0]
        if picture.startswith('//'):
            # Protocol-relative URL: make it absolute.
            picture = "https:" + picture
        return picture.replace('_18', '')

    def _split_payment(self, pay_text):
        """Split the pay-price text into ``(down_payment, monthly_payment)``.

        Either part is ``''`` when it is not present in ``pay_text``.
        """
        parts = pay_text.split(self.PAY_SEPARATOR)
        first_money = parts[0]
        month_money = parts[1] if len(parts) > 1 else ''
        return first_money, month_money

    @staticmethod
    def _price(li):
        """Return the number in front of ``万`` for this listing, or ``''``.

        Only ``<em>`` nodes inside the listing are searched, so each row
        gets its own price.
        NOTE(review): assumes the first ``<em>`` containing ``万`` inside
        the listing is the total price -- confirm against the live markup.
        """
        for text in li.xpath('.//em/text()'):
            found = re.findall(r'(.*?)万', text.strip().replace('\n', ''))
            if found:
                return found[0].strip()
        return ''

    def insert_mysql(self, car_dict):
        """Insert one listing into ``youxin``; roll back and go on if it fails.

        Args:
            car_dict: mapping with one entry per name in ``COLUMNS``.
        """
        params = tuple(str(car_dict[column]) for column in self.COLUMNS)
        try:
            self.cur.execute(self.INSERT_SQL, params)
            self.conn.commit()
            print(self.count, params)
            self.count += 1
        except Exception as e:
            # Best effort per row: one bad row must not stop the crawl.
            print(e)
            self.conn.rollback()

    def conn_mysql(self):
        """Open the MySQL connection and cursor used by ``insert_mysql``."""
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='admin',
                                    database='02180530', charset='utf8')
        self.cur = self.conn.cursor()
        # Printing the connection object is the "connected" signal.
        print(self.conn)
if __name__ == '__main__':
    # The crawl runs in three stages that hand data over through Redis.
    # Stages 1 and 2 only need to run when the Redis lists are empty;
    # uncomment them for a first run.
    # city = City()
    # city()
    # brand = AutomobileBrand()
    # brand()
    car = Car()
    car()

python爬虫---爬取优信二手车相关推荐

python 循环定时器 timer显示数据_【Python】多线程、定时循环爬取优信二手车信息...
爬虫爬取优信二手车:循环遍历每页,获取相应的有价值字段信息,这里不详细阐释了. 多线程 Python中,使用concurrent.futures模块下的ThreadPoolExecutor类来实现线 ...
基于requests库和lxml库爬取优信二手车
工具:lxml库和requests库 # _*_ coding:utf-8 _*_ import requests import re import time import MySQLdb impor ...
基于requests和lxml库爬取优信二手车
工具: requests库和lxml库具体代码: # _*_ coding:utf-8 _*_ import requests import re import time import MySQLd ...
scrapy爬取优信二手车
由于首页车辆分类是动态数据(解析动态页面),因此在页面源码找不到需要的数据,可以通过以下步骤找 car.py(爬虫文件) import scrapy import json from ..items ...
使用python爬虫爬取蓝调口琴网乐谱
学习目标:使用python爬虫爬取蓝调口琴网乐谱提示:这里可以添加学习目标例如:一周掌握 Java 入门知识学习内容: 使用爬虫爬取需要动态验证码(如手机短信验证码)登录的网站. 提示:这里可以 ...
在当当买了python怎么下载源代码-python爬虫爬取当当网
[实例简介]python爬虫爬取当当网 [实例截图] [核心代码] ''' Function: 当当网图书爬虫 Author: Charles 微信公众号: Charles的皮卡丘 ''' impor ...
python爬虫代码实例-Python爬虫爬取百度搜索内容代码实例
这篇文章主要介绍了Python爬虫爬取百度搜索内容代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下搜索引擎用的很频繁,现在利用Python爬 ...
python爬虫数据分析可以做什么-python爬虫爬取的数据可以做什么
在Python中连接到多播服务器问题,怎么解决你把redirect关闭就可以了.在send时,加上参数allow_redirects=False 通常每个浏览器都会设置redirect的次数.如果re ...
python爬虫爬取csdn博客专家所有博客内容
python爬虫爬取csdn博客专家所有博客内容: 全部过程采取自动识别与抓取,抓取结果是将一个博主的所有文章存放在以其名字命名的文件内,代码如下 #coding:utf-8import urlli ...

python爬虫---爬取优信二手车

python爬虫---爬取优信二手车相关推荐

最新文章

热门文章