python闲鱼二手爬虫_Python 爬虫咸鱼版

#encoding=utf-8

import re

import requests

import urllib2

import datetime

import MySQLdb

from bs4 import BeautifulSoup

import sys

reload(sys)

sys.setdefaultencoding("utf-8")

class Splider(object):
    """Scrape whowhatwear.com fashion-trends listings and store them in MySQL.

    Typical flow: ``changepage`` builds the listing-page URLs, ``getsource``
    downloads a page, ``getcontent`` turns a listing page into article
    records (fetching every linked article), and ``saveinfo`` inserts the
    records into the ``t_fashion_spider2`` table.

    The class targets Python 2 (it relies on ``urllib2`` and ``MySQLdb``).
    """

    def __init__(self):
        # Parenthesised so the line prints the same text under the Python 2
        # print statement and also parses as a function call.
        print(u'开始爬取内容...')

    def getsource(self, url):
        """Download ``url`` and return the raw response body.

        A desktop-browser User-Agent header is sent with the request.
        Errors raised by ``urllib2`` propagate to the caller.
        """
        headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2652.0 Safari/537.36'}
        req = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    def changepage(self, url, total_page):
        """Build the list of page URLs from the page in ``url`` up to ``total_page``.

        ``url`` must contain a ``page/<number>`` segment; that number is the
        first page generated. Returns an empty list when the starting page
        is already greater than ``total_page``.

        Raises:
            ValueError: if ``url`` has no ``page/<number>`` segment.
        """
        matched = re.search(r'page/(\d+)', url)
        if matched is None:
            raise ValueError('url has no "page/<number>" segment: %r' % (url,))
        now_page = int(matched.group(1))

        page_group = []
        for i in range(now_page, total_page + 1):
            # The fourth positional argument of re.sub is ``count``, not
            # ``flags``; passing re.S there silently capped replacements at
            # 16. No flag is needed for this pattern.
            page_group.append(re.sub(r'page/(\d+)', 'page/%d' % i, url))
        return page_group

    def _img_sources(self, node):
        """Return the ``src="..."`` values found in ``node`` joined with ';'.

        ``node`` is a parsed element or ``None``; ``None`` yields ''.
        """
        if node is None:
            return ''
        return ';'.join(re.findall('src="(.*?)"', str(node), re.S))

    def getchildrencon(self, child_url):
        """Fetch ``child_url`` and extract the article body text and images.

        Returns a dict with ``con`` (text of the ``div.c-article_content``
        element) and ``img`` (its image URLs joined with ';'). Both values
        are empty strings when the page has no such element.
        """
        page = self.getsource(child_url)
        soup = BeautifulSoup(page, 'html.parser', from_encoding='utf-8')
        content = soup.find('div', {'class': 'c-article_content'})

        conobj = {}
        # find() returns None when the element is absent.
        conobj['con'] = content.get_text() if content is not None else ''
        conobj['img'] = self._img_sources(content)
        return conobj

    def getcontent(self, html_doc):
        """Parse a listing page into article records.

        Every ``div.promo-feed-headline`` element in ``html_doc`` is read
        for its title, date and article link; the linked article is then
        downloaded to obtain its content, summary and images.

        Returns a dict keyed by consecutive integers starting at 0; each
        value is a dict with the keys ``title``, ``content_time``,
        ``source``, ``source_url``, ``content``, ``summary``, ``imgs`` and
        ``create_time``. Headlines missing the title, date or article link
        are skipped; article sections that are absent become empty strings.
        """
        soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
        info = {}

        for link in soup.find_all('div', {'class': 'promo-feed-headline'}):
            title_desc = link.find('h3')
            post_date = link.find('div', {'class': 'post-date'})
            source_link = link.find('a', href=re.compile(r"section=fashion-trends"))
            if title_desc is None or post_date is None or source_link is None:
                # Without these nodes there is nothing to fetch or store.
                continue

            source_url = 'http://www.whowhatwear.com' + source_link['href']
            in_content = self.getsource(source_url)
            in_soup = BeautifulSoup(in_content, 'html.parser', from_encoding='utf-8')
            soup_content = in_soup.find('section', {'class': 'widgets-list-content'})
            text_con = in_soup.find('section', {'class': 'text'})

            if soup_content is not None:
                content_text = soup_content.get_text().strip('\n')
            else:
                content_text = ''
            # The original fallback referenced an undefined name (NULL) and
            # tested ``.text`` on a node that may itself be None.
            if text_con is not None:
                summary = text_con.get_text().strip('\n')
            else:
                summary = ''

            # Keys stay consecutive even when some headlines were skipped.
            info[len(info)] = {
                'title': title_desc.get_text(),
                # Keep only the first 10 characters of the data-date value.
                'content_time': post_date['data-date'][0:10],
                'source': 'whowhatwear',
                'source_url': source_url,
                'content': content_text,
                'summary': summary[0:200] + '...',
                'imgs': self._img_sources(soup_content),
                'create_time': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }

        return info

    def saveinfo(self, content_info):
        """Insert scraped records into the ``t_fashion_spider2`` table.

        ``content_info`` is an iterable of dicts as returned by
        ``getcontent`` (one per listing page). All rows are committed
        together; if any insert fails the transaction is rolled back and
        the error is re-raised.
        """
        # Placeholders let the driver quote every value; previously only
        # three of the eight fields were escaped before interpolation.
        sql = ("insert into t_fashion_spider2(`title`,`summary`,`content`,"
               "`content_time`,`imgs`,`source`,`source_url`,`create_time`) "
               "values (%s,%s,%s,%s,%s,%s,%s,%s)")

        # NOTE(review): connection settings are hard-coded; consider moving
        # them to configuration.
        conn = MySQLdb.Connect(host='127.0.0.1',user='root',passwd='123456',port=3306,db='test',charset='utf8')
        try:
            cursor = conn.cursor()
            try:
                for each in content_info:
                    for v in each.values():
                        cursor.execute(sql, (
                            v['title'],
                            v['summary'],
                            v['content'],
                            v['content_time'],
                            v['imgs'],
                            v['source'],
                            v['source_url'],
                            v['create_time'],
                        ))
                conn.commit()
            except Exception:
                conn.rollback()
                raise
            finally:
                cursor.close()
        finally:
            conn.close()

if __name__ == '__main__':
    # Crawl listing pages 1..p_num of the fashion-trends section and store
    # every article found.
    classinfo = []
    p_num = 5
    url = 'http://www.whowhatwear.com/section/fashion-trends/page/1'

    jikesplider = Splider()
    all_links = jikesplider.changepage(url, p_num)

    for link in all_links:
        # Parenthesised so the line prints the same text under the Python 2
        # print statement.
        print(u'正在处理页面：' + link)
        html = jikesplider.getsource(link)
        info = jikesplider.getcontent(html)
        classinfo.append(info)

    # Save once after the loop: classinfo accumulates every page, so saving
    # inside the loop would insert earlier pages repeatedly.
    jikesplider.saveinfo(classinfo)

python闲鱼二手爬虫_Python 爬虫咸鱼版相关推荐

python从入门到爬虫_python爬虫从入门到放弃（一）之初识爬虫
什么是爬虫？网络爬虫（又被称为网页蜘蛛、网络机器人，在FOAF社区中间，更经常的称为网页追逐者），是一种按照一定的规则，自动地抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁、自动索引……
python爬取收费素材_Python爬虫练习：爬取素材网站数据
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 在工作中的电子文案.ppt,生活中的新闻.广告,都离不开大量的素材,而素材 ...
python基础知识500题_python爬虫基础知识点整理
更多编程教程请到:菜鸟教程 https://www.piaodoo.com/ 友情链接: 高州阳光论坛https://www.hnthzk.com/ 人人影视http://www.sfkyty.com ...
python通过ip池爬_python 爬虫代理ip池（适合初学者）
初次学习python爬虫的朋友在频繁访问被爬取页面网站时都会被拦截,也就是限制ip.这里教教大家建立代理ip池. #!/usr/bin/env python3# -*- coding: utf-8 - ...
51自学网python爬虫_Python爬虫基本流程
爬虫定义爬虫是请求网站并提取自己所需要数据的过程.通过我们的程序,可以代替我们向服务器发送请求,然后进行批量的数据下载. 爬虫基本流程发起请求通过url向服务器发送requests请求,请求可以 ...
python闲鱼爬虫_Python 爬虫爬坑路（二）——B站图片，咸鱼的正确 GET 姿势
前言昨天在写完入门级爬虫之后 ,马上就迫不及待的着手开始写 B站的图片爬虫了,真的很喜欢这个破站呢 (〜￣△￣)〜这里不涉及到 Python 爬虫的高级技巧,没有使用框架,没有考虑反爬机制,没有 ...
知乎python练手的_Python—爬虫之初级实战项目：爬取知乎任一作者的文章练手
爬虫之初级实战项目:爬取知乎任一作者的文章练手在正式上代码之前,先过一遍之前所学知识的框架内容,温故而知新!!! 接下来我们直接上代码,一定要手敲代码.手敲代码.手敲代码!!! import req ...
python爬取酒店信息_Python 爬虫练手项目—酒店信息爬取
from bs4 import BeautifulSoup import requests import time import re url = 'http://search.qyer.com/ho ...
python xpath爬虫_Python爬虫（2）：XPath语法
OK,上次我们说到了网页爬虫的一个思路:首先说一下网页爬虫的整个思路方法:先爬取整个网页,也就是将网页的源代码给获取下来爬取下来的网页再通过文本解析提取,找到我们需要的信息,可以是图片或者文字然后 ...

python闲鱼二手爬虫_Python 爬虫咸鱼版

python闲鱼二手爬虫_Python 爬虫咸鱼版相关推荐

最新文章

热门文章