1. Spider dmoz_spider.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:donghui

import scrapy
import re
from urllib.parse import quote
from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["btkuai.org"]
    start_urls = ["http://www.btkuai.org/word/" + quote("风光") + "_{}.html".format(n)
                  for n in range(1, 10)]

    def savefile(self, filename, var):
        # the path must be correct, relative to the directory scrapy is run from
        with open("tutorial/res/" + filename + ".csv", "w+") as f:
            f.write(var)
        # print("done saving")

    def parse(self, response):
        url_head = 'http://www.btkuai.org'
        # filename = response.url.split("/")[-2]
        selector = response.xpath('//div[@id="container"]/div/ul/li/div[@class="T1"]')
        for sel in selector:
            title = sel.xpath('a/text()').extract()[0]
            link = url_head + sel.xpath('a/@href').extract()[0]
            # keep only links that look like absolute URLs ending in .html
            if re.findall(r'([a-zA-Z]+://[^\s]*html$)', link, re.S):
                # print(title, link)
                # self.savefile(filename, title + "," + link)
                item = DmozItem()
                item['title'] = title
                item['link'] = link
                yield item
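
The spider is registered under name = "dmoz", so once the items, middlewares, and settings below are in place it can be started from the project root in the usual Scrapy way:

scrapy crawl dmoz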

2. Items items.py

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()

3. Proxy IPs middlewares.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:donghui

# random module, used to pick a proxy at random
import random
import json
# IPPOOL and IPPoolMode are defined in settings.py
from .settings import IPPOOL, IPPoolMode
# the HttpProxyMiddleware from Scrapy
# (the old scrapy.contrib / scrapy.conf paths are deprecated, so the current
#  module path is used and the pool mode is imported directly from settings)
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware


class IPPOOlS(HttpProxyMiddleware):
    # proxies validated beforehand, loaded once when the class is defined
    with open("../EffectiveIp.json", 'r') as handler:
        ips = json.load(handler)

    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        if IPPoolMode == 0:
            # 0: pick a proxy from IPPOOL in settings.py
            thisip = random.choice(IPPOOL)
            print("Proxy IP %s" % thisip["http"])
            request.meta["proxy"] = "http://" + thisip["http"]
        elif IPPoolMode == 1:
            # 1: pick a proxy from EffectiveIp.json
            thisip = random.choice(IPPOOlS.ips)
            print("Proxy IP %s" % thisip["http"])
            request.meta["proxy"] = "http://" + thisip["http"]
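
IPPOOlS reads ../EffectiveIp.json once at class definition time, and process_request indexes each entry with ["http"], so that file is expected to hold a JSON array in the same shape as IPPOOL. A minimal sketch of a helper that writes such a file (the addresses are placeholders and the helper itself is not part of the original project):

# write_effective_ip.py -- hypothetical helper, not part of the original project
import json

# same shape as IPPOOL in settings.py: a list of {"http": "ip:port"} entries
effective_ips = [
    {"http": "125.32.250.240:8060"},  # placeholder address
    {"http": "114.55.0.166:8090"},    # placeholder address
]

# place the file so the middleware's relative path ../EffectiveIp.json
# resolves from the directory where scrapy crawl is run
with open("EffectiveIp.json", "w") as f:
    json.dump(effective_ips, f)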

4. User-Agent middleware uamid.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:donghui

# random module, used to pick a user agent at random
import random
# UPPOOL is defined in settings.py
from .settings import UPPOOL
# the UserAgentMiddleware from Scrapy
# (the old scrapy.contrib path is deprecated, so the current module path is used)
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class Uamid(UserAgentMiddleware):
    # keep the user_agent argument; omitting it easily leads to errors
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # pick a random user agent for this request
        thisua = random.choice(UPPOOL)
        print("Current User-Agent: " + thisua)
        request.headers.setdefault('User-Agent', thisua)

5. settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.btkuai.org)'
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# MongoDB configuration
MONGO_HOST = "127.0.0.1"  # host IP
MONGO_PORT = 27017  # port
MONGO_DB = "btKuai"  # database name
MONGO_COLL = "fengguang"  # collection

# 0: take proxies from IPPOOL below; 1: take proxies from EffectiveIp.json
IPPoolMode = 1

# Proxy IP pool
IPPOOL = [
    {"http": "125.32.250.240:8060"},
    {"http": "183.159.93.165:61234"},
    {"http": "119.49.33.238:8060"},
    {"http": "119.187.120.118:8060"},
    {"http": "120.25.203.182:7777"},
    {"http": "121.17.18.219:8060"},
    {"http": "123.8.41.163:8060"},
    {"http": "119.41.236.180:8010"},
    {"http": "121.17.18.218:8060"},
    {"http": "114.55.0.166:8090"},
    {"http": "118.122.105.99:9000"},
    {"http": "45.115.39.139:7777"}
]
# User-Agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]

# Disable local cookies
COOKIES_ENABLED = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tutorial.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    #'tutorial.middlewares.MyCustomDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'tutorial.middlewares.IPPOOlS': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'tutorial.uamid.Uamid': 1
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'tutorial.pipelines.SomePipeline': 300,
    'tutorial.pipelines.BtKuaiMongo': 300,
    'tutorial.pipelines.JsonWritePipline': 300
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
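
ITEM_PIPELINES above enables tutorial.pipelines.BtKuaiMongo and tutorial.pipelines.JsonWritePipline, but pipelines.py itself is not shown in this post. The following is only a minimal sketch of what it could look like, assuming pymongo is installed and reusing the MongoDB settings defined above; the exact field handling and the output file name items.json are assumptions, not the original code.

# tutorial/pipelines.py -- hypothetical sketch, not the original implementation
import json

import pymongo

from .settings import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_COLL


class BtKuaiMongo(object):
    def open_spider(self, spider):
        # connect with the MongoDB settings from settings.py
        self.client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)
        self.coll = self.client[MONGO_DB][MONGO_COLL]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # one document per item: {"title": ..., "link": ...}
        self.coll.insert_one(dict(item))
        return item


class JsonWritePipline(object):
    def open_spider(self, spider):
        # assumed output path; one JSON object per line
        self.file = open("items.json", "w", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item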
