import random
import time
from multiprocessing import Pool

import requests
from fake_useragent import UserAgent
from lxml.html import etree, HTMLParser
from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning

# Silence the HTTPS warnings that requests emits for verify=False calls.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)

# Supplies a random User-Agent header value for each request.
ua = UserAgent()
# Counter of processed detail pages; also drives when session_xin is refreshed.
number = 1
# Current value sent as the session_xin cookie.
new_session_xin = 'k8935l0tr72p6dfngdfnuiukoo4n6jfn'
# Current value sent as the XIN_anti_uid cookie.
anti_uid = '8F932282-2E08-FA10-DDDC-841EEF3E0BF3'

def get_proxy():
    """Fetch one proxy address from the local proxy-pool service.

    Returns:
        A ``proxies`` mapping for ``requests`` with a single ``http`` entry
        built from the body returned by ``http://localhost:5010/get/``.
    """
    # Strip whitespace so a trailing newline cannot corrupt the proxy URL.
    address = requests.get('http://localhost:5010/get/').text.strip()
    # NOTE(review): only the 'http' scheme is mapped while every page fetched
    # below is https, so the proxy is presumably not used for those requests.
    # Confirm whether an 'https' entry is wanted.
    return {'http': 'http://' + address}


def get_session_xin():
    """Request a fresh ``session_xin`` cookie value from xin.com.

    Returns:
        The ``session_xin`` cookie of the response, or the placeholder
        string '没有' when the response carries no such cookie.
    """
    headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        'Cookie': 'XIN_bhv_oc=1233; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9'.format(anti_uid),
    }
    response = requests.get('https://www.xin.com/search/get_wishlist_token', headers=headers, proxies=get_proxy(), verify=False)
    # The new session id is read from the cookies set by the response.
    session_xin = response.cookies.get('session_xin', '没有')
    print(session_xin)
    return session_xin


def get_list_page(page_num):
    """Download one search-result list page.

    Args:
        page_num: Page index inserted into the list URL.

    Returns:
        The response body as text.

    Side effects:
        Replaces the global ``anti_uid`` when the response sets an
        ``XIN_anti_uid`` cookie, and prints which case occurred.
    """
    # NOTE(review): when this runs inside a Pool worker the assignment below
    # presumably only changes that worker's copy of the global - confirm.
    global anti_uid
    tm = str(int(time.time()))  # whole-second timestamp for the Hm_lpvt cookie
    url = 'https://www.xin.com/zhengzhou/baoma/i{}'.format(page_num)
    headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        'Cookie': 'RELEASE_KEY=; XIN_bhv_oc=1233; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9; Hm_lvt_ae57612a280420ca44598b857c8a9712=1530510447; Hm_lpvt_ae57612a280420ca44598b857c8a9712={}; session_xin={}; SEO_REF=https://www.xin.com/zhengzhou/baoma/'.format(anti_uid, tm, new_session_xin),
    }
    response = requests.get(url, headers=headers, verify=False, proxies=get_proxy())
    uid = response.cookies.get('XIN_anti_uid', '')
    if uid:
        print('uid = ', uid)
        anti_uid = uid
    else:
        print('uid 不存在')
    return response.text


def parse_list_page(list_page):
    """Extract detail-page links from a list page and crawl them in a pool.

    Args:
        list_page: HTML text of a list page, as returned by ``get_list_page``.

    Each ``h2 .tit`` link is fetched with ``get_detail_page`` in a pool of four
    workers and the result is handed to ``parse_detail_page``. The call blocks
    until every submitted task has finished.
    """
    list_obj = etree.HTML(list_page, parser=HTMLParser(encoding='utf-8'))
    if list_obj is None:
        # Nothing parseable (e.g. empty body): there are no links to follow.
        return
    links = list_obj.cssselect('h2 .tit')
    list_pool = Pool(4)
    try:
        for link in links:
            href = link.attrib.get('href')
            if not href:
                continue
            # Hrefs are scheme-relative, so the scheme is prepended here.
            list_pool.apply_async(get_detail_page, args=('https:' + href,), callback=parse_detail_page)
    finally:
        # Always release the workers, even if submitting a task failed.
        list_pool.close()
        list_pool.join()


def get_detail_page(detail_url):
    """Download one vehicle detail page.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        A ``(html_text, detail_url)`` tuple.

    Side effects:
        When ``number`` is a multiple of 9, increments ``number`` and replaces
        the global ``new_session_xin`` with a freshly requested value before
        the page is fetched.
    """
    global number, new_session_xin
    # Refresh session_xin after several detail pages have been crawled. The
    # page is then fetched below with the new session, so a tuple is always
    # returned (the rotation path used to return None to the callback).
    if number % 9 == 0:
        number += 1
        new_session_xin = get_session_xin()

    number_list = [
        base + number
        for base in (1525, 1319, 1262, 1436, 1561, 1452, 1618, 1624,
                     1632, 1631, 1646, 1742, 1814, 1891, 1847, 2286)
    ]
    tm = str(int(time.time()))  # whole-second timestamp for the Hm_lpvt cookie
    headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        # Placeholders in order: XIN_bhv_oc, XIN_anti_uid, Hm_lpvt, session_xin,
        # XIN_bhv_pc. The first two arguments were previously swapped, which
        # put the anti-uid into XIN_bhv_oc and a number into XIN_anti_uid.
        'Cookie': 'RELEASE_KEY=; XIN_bhv_oc={}; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9; Hm_lvt_ae57612a280420ca44598b857c8a9712=1530510447; Hm_lpvt_ae57612a280420ca44598b857c8a9712={}; session_xin={}; SEO_REF=https://www.xin.com/zhengzhou/baoma/; XIN_CARBROWSE_IDS=%5B67720293%5D; XIN_bhv_pc={}; XIN_bhv_expires=1530597119591'.format(random.choice(number_list), anti_uid, tm, new_session_xin, number),
    }
    response = requests.get(detail_url, headers=headers, verify=False, proxies=get_proxy())
    return response.text, detail_url


def parse_detail_page(detail_tuple):
    """Print the title and price found on a detail page.

    Args:
        detail_tuple: The ``(html_text, detail_url)`` tuple returned by
            ``get_detail_page``.

    Side effects:
        Prints ``detail_url, title, price`` and increments the global
        ``number`` when both fields are found; otherwise prints a notice and
        leaves ``number`` unchanged.
    """
    global number
    detail_page, detail_url = detail_tuple
    detail_obj = etree.HTML(detail_page, parser=HTMLParser(encoding='utf-8'))
    if detail_obj is None:
        title_parts, prices = [], []
    else:
        title_parts = detail_obj.xpath('//span[@class="cd_m_h_tit"]//text()')
        prices = detail_obj.xpath('//span[@class="cd_m_info_jg"]/b/text()')

    # This runs as a Pool callback, so a page without the expected markup is
    # reported and skipped instead of raising IndexError.
    if not title_parts or not prices:
        print('could not parse detail page:', detail_url)
        return

    # Prefer the fourth text node of the title span; fall back to the first.
    title = title_parts[3] if len(title_parts) > 3 else title_parts[0].strip()
    price = prices[0].strip()
    print(detail_url, title, price)
    number += 1

if __name__ == '__main__':
    # Crawl list pages 1-50 with four workers; each downloaded page is passed
    # to parse_list_page as the apply_async callback.
    pool = Pool(4)
    for x in range(1, 51):
        print('开始获取第{}页...'.format(x))
        pool.apply_async(get_list_page, args=(x,), callback=parse_list_page)
    pool.close()
    pool.join()

天眼

import time
from multiprocessing import Pool
from urllib.parse import quote

import requests
from fake_useragent import UserAgent
from lxml.html import etree
from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning
# from fontTools.ttLib import TTFont

# Silence the HTTPS warnings that requests emits for verify=False calls.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)

# Supplies a random User-Agent header value for each request.
ua = UserAgent()
# First substitution table: maps each character of the obfuscated number text
# read from the page to the real character.
number_dict1 = {
    '0': '9',
    '1': '2',
    '2': '1',
    '3': '4',
    '4': '7',
    '5': '8',
    '6': '3',
    '7': '5',
    '8': '6',
    '9': '0',
    '-': '-',
}

# Worked examples, written as "obfuscated: real".
# These two follow number_dict1:
# 8935: 6048
# 8936: 6043
# These two follow number_dict2:
# 8936-94-90: 2017-08-09
# 8936-95-84: 2017-06-28

# Second substitution table, tried when the first one does not produce a value
# starting with '2'.
number_dict2 = {
    '0': '9',
    '1': '4',
    '2': '5',
    '3': '1',
    '4': '8',
    '5': '6',
    '6': '7',
    '7': '3',
    '8': '2',
    '9': '0',
    '-': '-',
}

# Search keyword used to build the list-page URL.
KEY_WORD = '智游'

# response = requests.get('https://static.tianyancha.com/fonts-styles/fonts/49/49631975/tyc-num.woff').text

def get_proxy():
    """Fetch one proxy address from the local proxy-pool service.

    Returns:
        A ``proxies`` mapping for ``requests`` with a single ``http`` entry
        built from the body returned by ``http://localhost:5010/get/``.
    """
    # Strip whitespace so a trailing newline cannot corrupt the proxy URL.
    address = requests.get('http://localhost:5010/get/').text.strip()
    # NOTE(review): only the 'http' scheme is mapped while every page fetched
    # below is https, so the proxy is presumably not used for those requests.
    # Confirm whether an 'https' entry is wanted.
    return {'http': 'http://' + address}


def get_list_page(page_num):
    """Download one search-result page for ``KEY_WORD``.

    Args:
        page_num: Page index inserted into the search URL.

    Returns:
        The response body as text.
    """
    tm = str(int(time.time()))  # whole-second timestamp for the Hm_lpvt cookie
    # NOTE(review): the cookie embeds hard-coded login tokens; consider loading
    # them from configuration instead of source.
    headers = {
        'User-Agent': ua.random,
        'Host': 'www.tianyancha.com',
        'Cookie': 'TYCID=2b902090793a11e8bbf42fcb3431841d; undefined=2b902090793a11e8bbf42fcb3431841d; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1530015137,1530061830,1530104465,1530519246; ssuid=4009891320; aliyungf_tc=AQAAAP+boVulnQoAg6cPqxTilju98D0f; csrfToken=yg6QXuv2Dch1Abfr-giP-AH4; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758={}; RTYCID=24412db0b3da41c5be4439ba8f942ce8; bannerFlag=true; token=1675836c554a48fe9bcc18cfc45cb4d0; _utm=788b0bb711164fda9a5e6b1964bb5bf9; tyc-user-info=%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252213037677318%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA'.format(tm),
    }
    list_url = 'https://www.tianyancha.com/search/p{}?key={}'.format(page_num, quote(KEY_WORD))
    response = requests.get(list_url, headers=headers, verify=False, proxies=get_proxy())
    return response.text


def parse_list_page(list_page):
    """Extract company rows from a search page and crawl their detail pages.

    Args:
        list_page: HTML text of a search page, as returned by
            ``get_list_page``.

    For each ``.search_row_new`` row paired with its ``.query_name`` link, the
    legal person, registered capital and registration date are read. Rows
    missing any of these are skipped. Complete rows are fetched with
    ``get_detail_page`` in a pool of four workers and passed to
    ``parse_detail_page``. The call blocks until every task has finished.
    """
    list_html = etree.HTML(list_page, parser=etree.HTMLParser(encoding='utf-8'))
    if list_html is None:
        # Nothing parseable (e.g. empty body): there are no rows to process.
        return
    divs = list_html.cssselect('.search_row_new')
    all_a = list_html.cssselect('.query_name')
    detail_pool = Pool(4)
    try:
        # zip pairs rows with links and stops at the shorter list, so a count
        # mismatch cannot raise IndexError inside this callback.
        for div, link in zip(divs, all_a):
            detail_url = link.attrib.get('href')
            if not detail_url:
                continue
            try:
                person = div.cssselect('.legalPersonName')[0].text
                zhuceziben = div.xpath('.//span[contains(@title, "人民币")]/text()')[0]
                zhuceshijian = div.xpath('.//span[contains(@title, "-")]/text()')[0]
            except IndexError:
                # Row lacks one of the expected fields; skip it.
                continue
            detail_pool.apply_async(get_detail_page, args=(detail_url, person, zhuceziben, zhuceshijian), callback=parse_detail_page)
    finally:
        # Always release the workers, even if submitting a task failed.
        detail_pool.close()
        detail_pool.join()


def get_detail_page(detail_url, person, zhuceziben, zhuceshijian):
    """Download one company detail page.

    Args:
        detail_url: URL of the detail page.
        person: Legal person text taken from the list row.
        zhuceziben: Registered-capital text taken from the list row.
        zhuceshijian: Registration-date text taken from the list row.

    Returns:
        ``(html_text, detail_url, person, zhuceziben, zhuceshijian)``; the
        list-row values are passed through unchanged for the callback.
    """
    tm = str(int(time.time()))  # whole-second timestamp for the Hm_lpvt cookie
    headers = {
        'User-Agent': ua.random,
        'Host': 'www.tianyancha.com',
        'Cookie': 'TYCID=2b902090793a11e8bbf42fcb3431841d; undefined=2b902090793a11e8bbf42fcb3431841d; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1530015137,1530061830,1530104465,1530519246; ssuid=4009891320; aliyungf_tc=AQAAAP+boVulnQoAg6cPqxTilju98D0f; csrfToken=yg6QXuv2Dch1Abfr-giP-AH4; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758={}; RTYCID=24412db0b3da41c5be4439ba8f942ce8; bannerFlag=true; token=1675836c554a48fe9bcc18cfc45cb4d0; _utm=788b0bb711164fda9a5e6b1964bb5bf9; tyc-user-info=%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252213037677318%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA'.format(tm),
    }
    response = requests.get(detail_url, headers=headers, proxies=get_proxy())
    return response.text, detail_url, person, zhuceziben, zhuceshijian


def parse_detail_page(detail_tuple):
    """Decode the obfuscated date on a detail page and print the record.

    Args:
        detail_tuple: The five-element tuple returned by ``get_detail_page``.

    The text of the first ``.base0910 .tyc-num`` element is translated with
    ``number_dict1``. If the result does not start with '2', it is translated
    with ``number_dict2`` instead. When the element or its text is missing,
    the decoded value printed is an empty string.
    """
    detail_html, detail_url, person, zhuceziben, zhuceshijian = detail_tuple
    detail_obj = etree.HTML(detail_html, parser=etree.HTMLParser(encoding='utf-8'))
    nodes = detail_obj.cssselect('.base0910 .tyc-num') if detail_obj is not None else []
    date_str = nodes[0].text if nodes else None

    res_str = ''
    if date_str:
        # translate() leaves characters absent from the table untouched, so an
        # unexpected character cannot raise inside this Pool callback.
        res_str = date_str.translate(str.maketrans(number_dict1))
        # If the first rule set fails, try the second rule set.
        if not res_str.startswith('2'):
            res_str = date_str.translate(str.maketrans(number_dict2))
    print(detail_url, person, zhuceziben, zhuceshijian, res_str)


if __name__ == '__main__':
    # Crawl search pages 1-5 with a single worker; each downloaded page is
    # passed to parse_list_page as the apply_async callback.
    pool = Pool(1)
    for x in range(1, 6):
        pool.apply_async(get_list_page, args=(x,), callback=parse_list_page)
    pool.close()
    pool.join()

爬虫实战项目--优信二手车--天眼

相关推荐

  1. python爬虫---爬取优信二手车

    import requests import re import redis from lxml import etree import pymysql #获取城市数据 class City():de ...

  2. python 循环定时器 timer显示数据_【Python】多线程、定时循环爬取优信二手车信息...

    爬虫 爬取优信二手车:循环遍历每页,获取相应的有价值字段信息,这里不详细阐释了. 多线程 Python中,使用concurrent.futures模块下的ThreadPoolExecutor类来实现线 ...

  3. 仿优信二手车品牌选择demo

    不知道为啥,markdown编辑器不能用,一打开就奔溃,换个浏览器也不行,重启也不好使,估计是csdn那块出问题了吧:索性这会想写就干脆用普通编辑器写了,排版不好了大伙多担待.项目实现方法如果有更好的 ...

  4. php面试题 优信二手车_分布式定时任务对比 - sharedCode的个人空间 - OSCHINA - 中文开源技术交流社区...

    分布式定时任务 1. 什么是分布式定时任务 把分散的,可靠性差的计划任务纳入统一的平台,并实现集群管理调度和分布式部署的一种定时任务的管理方式.叫做分布式定时任务. 2. 常见开源方案 elastic ...

  5. java爬虫教程 百度云_java视频教程java爬虫实战项目httpclient hbase springmvc solr

    资源内容: java视频教程java爬虫实战项目httpclient hbase springmvc solr|____猫了个咪-更多IT精品课程.html|____猫了个咪--it视频论坛.url| ...

  6. 优信二手车:赛道虽好,生意难做

    配图来自Canva可画 高营销投入没能换来高回报,优信.瓜子二手车.人人车等二手车电商,集体陷入求生困境. 早前,人人车大规模裁员,破产.卖身.高层离职等传言层出不穷,曾经火爆大街小巷的人人车广告也逐 ...

  7. python爬虫项目-32个Python爬虫实战项目,满足你的项目慌

    原标题:32个Python爬虫实战项目,满足你的项目慌 爬虫项目名称及简介 一些项目名称涉及企业名词,小编用拼写代替 1.[WechatSogou]- weixin公众号爬虫.基于weixin公众号爬 ...

  8. python常用代码大全-Python 网络爬虫实战项目代码大全

    原标题:Python 网络爬虫实战项目代码大全 DouBanSpider 豆瓣读书的爬虫.你可以爬豆瓣读书下面标签下的所有图书,按评分排名依次存储,存储到Excel中,可方便大家筛选搜罗,比如筛选评价 ...

  9. 10个Python爬虫实战项目

    Python爬虫是指使用Python语言编写程序,自动化地从互联网上获取数据并进行处理和分析的技术.Python爬虫是一项复杂而且实用的技术,需要掌握多个方面的基础知识,并具备较强的编程能力和实际操作 ...

最新文章

  1. linux磁盘管理系列一:磁盘配额管理
  2. 向vSphere迁移虚拟机,Converter需要升级至4.0.1
  3. python怎么用excel-Python怎么写入excel文件?详细实例在这里。。。
  4. Mysql中查询速度的优化
  5. Spring Cache-缓存概述及使用
  6. javascript闭包学习
  7. 跨行合并居中html标记语言,[前端Web]HTML语法最全解-建议收藏
  8. 20200207_Dontla_MBTI第二步解释性报告(ISTJ(内向+感觉+思考+判断))
  9. Persistent Memory编程简介
  10. Loader之二:CursorLoader基本实例
  11. java生成首字母拼音简码的总结
  12. java agent_如何脚踏实地构建Java Agent
  13. LintCode 795. 4种独特的路径(DFS)
  14. 设计【SPFA】【差分约束】
  15. Python-将txt文件转换成Excel
  16. SAS学习7(gpolt过程、gchart过程、图形输出、ods功能、假设检验,ttest过程,rank过程,秩和检验(非参数检验))
  17. 华为设备配置Easy IP 地址转换
  18. 转录组分析_转录组分析的正确姿势
  19. Vue 使用three.js实现简单全景图
  20. JWT生成Token及解析Token

热门文章

  1. Windows、Linux、Apple三大操作系统的主流文件系统包含哪些?
  2. 初探Java设计模式5:一文了解Spring涉及到的9种设计模式
  3. RTL8211使用说明
  4. FGMap学习之--天气预报
  5. AI和区块链的融合会带来什么?
  6. 初遇电容-电容在电路中的作用
  7. 银行间本币市场上下行接口
  8. 如何为几乎所有电话添加无线充电
  9. Kaggle 新赛 | GoDaddy 小微企业密度预测
  10. 前端访问后端html,前端访问没问题 后端无法访问