抓取kaggle网站代码

GitHub: https://github.com/Gscsd8527/AllProject/blob/master/Kaggle/new_kaggle.py

import requests
import re
import json
import uuid
import datetime

# Root of every absolute URL this scraper builds.
BASE_URL = 'https://www.kaggle.com'

# Baseline browser-like headers; each request copies and extends these.
HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

# One shared session so cookies (and the XSRF token cookie) persist across calls.
Session = requests.session()
# Kaggle surfaces at most 10,000 results, so cap slightly above that.
DATASETS = 10020


def getToken():
    """Fetch the Kaggle datasets page and extract the anti-forgery (XSRF) token.

    Returns:
        The token string, or '' when the page could not be fetched or the
        token pattern is not present in the HTML.
    """
    headers = HEADERS.copy()
    headers['upgrade-insecure-requests'] = '1'
    url = 'https://www.kaggle.com/datasets'
    response = Session.get(url, headers=headers)
    token = ''
    if response.status_code == 200:
        text = response.text
        # Guard against a page-layout change: the original `findall(...)[0]`
        # raised IndexError when the pattern stopped matching; now we simply
        # fall through and return '' like the non-200 path does.
        matches = re.findall("antiForgeryToken: '(.*?)',", text)
        if matches:
            token = matches[0]
    else:
        print('错误响应码为: ', response.status_code)
    return token
def getEveryPageData(token):
    """Build the search payload for every result page and fetch each page.

    Kaggle returns 20 datasets per page, so DATASETS results span
    ceil(DATASETS / 20) pages. Each page is fetched immediately via
    requestEveryPageData().

    Args:
        token: anti-forgery token obtained from getToken().
    """
    pages = DATASETS // 20
    ys = DATASETS % 20
    # Any non-zero remainder needs one extra partial page. The original
    # `if ys > 1` was off by one: a remainder of exactly 1 silently
    # dropped the final page.
    if ys > 0:
        pages += 1
    for num in range(1, pages + 1):
        print('总共有 {}  页,这是第  {}  条数据'.format(pages, num))
        data = {
            "page": num,
            "group": "public",
            "size": "all",
            "fileType": "all",
            "license": "all",
            "viewed": "all",
            "categoryIds": [],
            "search": "",
            # Other orderings the endpoint accepts: hottest / votes /
            # published / updated.
            "sortBy": "usability",
            "userId": None,
            "organizationId": None,
            "maintainerOrganizationId": None,
            "minSize": None,
            "maxSize": None,
            "isUserQuery": 'false'
        }
        requestEveryPageData(token, data)
def myadd(lst1, lst2):
    """Pad lst2 in place with single-space placeholders until it is at
    least as long as lst1, then return lst2.

    Used to keep the parallel filename/size/description/column lists the
    same length before zipping them together.
    """
    shortfall = len(lst1) - len(lst2)
    if shortfall > 0:
        lst2.extend([' '] * shortfall)
    return lst2
def parse1(dt, filenames, filesizes, about_this_files, Columns):
    """Parse dataset metadata from `dt` when file entries carry extensions.

    `dt` is the raw JSON-ish string scraped by parseUrl(). The four list
    arguments are filled in place and also returned: file names, row
    counts, per-file descriptions, and per-file column lists.
    NOTE(review): the regexes are tightly coupled to Kaggle's embedded
    page JSON of the time — presumably fragile against layout changes.
    """
    try:
        filename = re.findall('"relativePath":"(.*?)",', dt)
        # Extract the file name and its size (row count).
        for na in filename:
            try:
                name = na.split('/')[-1]
                filenames.append(name)
                size = re.findall('%s.*?"rowCount":(\d+)' % name, dt)
                if len(size):
                    filesizes.append(size[0])
                else:
                    # No row count found: keep lists aligned with a placeholder.
                    filesizes.append(' ')
            except:
                pass
        # Extract the "about this file" description for each file.
        for name in filenames:
            try:
                about_this_file = re.findall('"name":"%s","description":"(.*?)"}' % name, dt)
                if len(about_this_file):
                    # Strip escaped-unicode artifacts left over from the page JSON.
                    about_file = about_this_file[0].replace('\n', '').replace('\\', '').replace('u0022', '').replace('u0026gt;', '').replace('u0026lt;', '').replace('u0027', '')
                    about_this_files.append(about_file)
                else:
                    about_this_files.append(' ')
            except:
                pass
        # Extract the columns; the "columns" data sits above the file name.
        for name in filenames:
            filename_len = len(filenames)
            index = filenames.index(name)
            # Distinguish the last file from the rest (same regex either way).
            column = ''
            if index < (filename_len-1):
                # Match from the first occurrence of this file name to the next;
                # entries with extensions keep matching inside that window.
                try:
                    col_compile = re.compile('%s.*?"columns":.*?%s' % (name, name))
                    data1 = re.findall(col_compile, dt)[0]
                    column = re.findall('"name":"(.*?)","description":("\w+.*?"|null)', data1)
                except:
                    pass
            elif filenames[-1] == name:
                try:
                    col_compile = re.compile('%s.*?"columns":.*?%s' % (name, name))
                    data1 = re.findall(col_compile, dt)[0]
                    column = re.findall('"name":"(.*?)","description":("\w+.*?"|null)', data1)
                except:
                    pass
            columns = []
            dct = {}
            if len(column):
                # Deduplicate column names via a dict, then length-cap the values.
                for i in column:
                    k, v = i[0].strip(' '), i[1].strip(' ').strip(r'\"')
                    dct[k] = v
                for k, v in dct.items():
                    if len(k):
                        if len(k) > 300:
                            k = ''
                        if len(v) > 500:
                            v = ''
                        # Skip "columns" that are really file names caught by the regex.
                        if k not in filenames:
                            columns.append([k, v])
                Columns.append(columns)
            else:
                Columns.append(['',''])
            del dct
    except:
        pass
    return filenames, filesizes, about_this_files, Columns
def parse2(dt, filenames, filesizes, about_this_files, Columns):
    """Parse dataset metadata from `dt` when file entries have no extension.

    Fallback used by parseData() when no "relativePath" keys are present.
    Rebuilds all four lists from scratch (the passed-in lists are ignored)
    and returns them: names, row counts, descriptions, column lists.
    """
    try:
        data = re.findall('],"name":"(.*?)","description":"(.*?)"', dt)
        filenames = []
        filesizes = []
        about_this_files = []
        Columns = []
        # Collect file name + description pairs, skipping "Context" blurbs
        # and length-capping oversized matches.
        for i in data:
            filter_str = 'Context'
            if filter_str not in i[1]:
                k = i[0].strip(' ')
                if len(k) > 300:
                    k = ' '
                v = i[1].strip(' ')
                if len(v) > 500:
                    v = ' '
                filenames.append(k)
                about_this_files.append(v)
        # Row count per file (placeholder when absent, to keep lists aligned).
        for name in filenames:
            size = re.findall('%s.*?"rowCount":(\d+)' % name, dt)
            if len(size):
                filesizes.append(size[0])
            else:
                filesizes.append(' ')
        # Columns: for the first file match from the leading "columns" key;
        # for later files match between the previous file name and this one.
        for name in filenames:
            index = filenames.index(name)
            if index == 0:
                col_compile = re.compile('"columns":.*?%s' % name)
                data1 = re.findall(col_compile, dt)[0]
                column = re.findall('"name":"(.*?)","description":("\w+.*?"|null|"")', data1)
            else:
                col_compile = re.compile('%s.*?"columns":.*?%s' % (filenames[index - 1], name))
                data1 = re.findall(col_compile, dt)[0]
                column = re.findall('"name":"(.*?)","description":("\w+.*?"|null|".*?")', data1)
            columns = []
            dct = {}
            if len(column):
                # Deduplicate via dict, cap lengths, drop file-name collisions.
                for i in column:
                    k, v = i[0].strip(' '), i[1].strip(' ').strip(r'\"')
                    dct[k] = v
                for k, v in dct.items():
                    if len(k):
                        if len(k) > 300:
                            k = ''
                        if len(v) > 500:
                            v = ''
                        if k not in filenames:
                            columns.append([k, v])
                Columns.append(columns)
            else:
                Columns.append(['', ''])
            del dct
    except:
        pass
    return filenames, filesizes, about_this_files, Columns


def parseData(dt):
    """Turn the scraped page string `dt` into a {index: file-info} dict.

    Chooses parse1 (files with extensions, detected via "relativePath")
    or parse2 (no extensions), pads the parallel lists to equal length,
    and assembles one record per file with name/size/type/description/columns.
    """
    # Presence of "relativePath" selects which of the two parsers applies.
    filename = re.findall('"relativePath":"(.*?)",', dt)
    # File name list.
    filenames = []
    # File size (row count) list.
    filesizes = []
    # "About this file" description list.
    about_this_files = []
    # Per-file columns list.
    Columns = []
    if filename:
        filenames, filesizes, about_this_files, Columns = parse1(dt, filenames, filesizes,
about_this_files, Columns)
    else:
        filenames, filesizes, about_this_files, Columns = parse2(dt, filenames, filesizes, about_this_files, Columns)
    # Pad the three secondary lists so zip() does not drop entries.
    filesizes = myadd(filenames, filesizes)
    about_this_files = myadd(filenames, about_this_files)
    Columns = myadd(filenames, Columns)
    a = zip(filenames, filesizes, about_this_files, Columns)
    file_data = {}
    index = 1
    for i in a:
        file_name = i[0]
        file_size = i[1]
        file_about = i[2]
        file_column = i[3]
        file_type = ''
        # File type is the extension after the last dot, when there is one.
        if '.' in file_name:
            file_type = file_name.rsplit('.', 1)[-1]
        temp_dict = {index: {'fileName': file_name, 'fileSize': file_size, 'fileType': file_type, 'aboutThisFile': file_about, 'columns': file_column}}
        file_data.update(temp_dict)
        index += 1
    return file_data


def parseUrl(url):
    """Fetch one dataset page and extract its two embedded JSON payloads.

    Returns (description_str, dt): the ld+json description block and the
    DatasetContainer push(...) payload used by parseData().
    Raises IndexError when either pattern is absent from the page; the
    caller wraps this call in a try/except.
    """
    headers = HEADERS.copy()
    temp_headers = {
        'accept': 'text/html, application/xhtml+xml',
        'Referer': 'https://www.kaggle.com/datasets',
        'Turbolinks-Referrer': 'https://www.kaggle.com/datasets'
    }
    headers.update(temp_headers)
    response = requests.get(url, headers=headers)
    text = response.text
    # Structured dataset description (schema.org ld+json block).
    re_compile = re.compile(r'<script.*?type="application/ld\+json">(.*?)</script>')
    description_str = re.findall(re_compile, text)[0]
    # Inline <script> inside the DatasetContainer div holds the file data.
    re_compile = re.compile(r'<div data-component-name="DatasetContainer".*?<script.*?>(.*?)</script>')
    data = re.findall(re_compile, text)[0]
    # The payload is the argument of a JS push(...) call.
    dt_compile = re.compile('push\((.*)}')
    dt = re.findall(dt_compile, data)[0]
    return description_str, dt
def requestEveryPageData(token, data):
    """POST one search-page payload to Kaggle and persist each dataset found.

    For every dataset item in the response, assembles a metadata record
    (identity, owner, license, tags, similar datasets, description, ...)
    and appends it as one JSON line to KaggleData4.json. Items that fail
    are logged to error.json.

    Args:
        token: anti-forgery token, sent both as verification and XSRF header.
        data:  one page's search payload built by getEveryPageData().
    """
    url = 'https://www.kaggle.com/requests/SearchDatasetsRequest'
    headers = HEADERS.copy()
    temp_headers = {
        '__requestverificationtoken': token,
        'accept': 'application/json',
        'content-type': 'application/json',
        'Referer': 'https://www.kaggle.com/datasets',
        'x-xsrf-token': token
    }
    headers.update(temp_headers)
    response = Session.post(url, data=json.dumps(data), headers=headers)
    if response.status_code == 200:
        text = response.text
        data_json = json.loads(text)
        items = data_json['result']['items']
        for item in items:
            try:
                # 1. Dataset identity: a fresh UUID with the dashes stripped.
                uid = uuid.uuid1()
                suid = str(uid).replace('-', '')
                datasetId = suid
                doi = ''
                handle = ''
                title = item['title']
                print('name= ', title)
                datasetOwner = item['ownerName']
                datasetOwnerIconPath = item['ownerAvatarUrl']
                overview = item['overview']
                downloadUrl = ''
                try:
                    downloadUrl = BASE_URL + item['downloadUrl']
                except KeyError:
                    print('downloadUrl error')
                source = BASE_URL + item['datasetUrl']
                lastUpdated = item['dateUpdated']
                dateCreated = item['dateCreated']
                # Renamed from `license` to avoid shadowing the builtin.
                license_name = item['licenseName']
                visibility = 'Public'
                datasetSize = ''
                try:
                    datasetSize = item['datasetSize']
                except KeyError:
                    print('datasetSize error')
                currentVersion = item['currentDatasetVersionNumber']
                expectedUpdateFrequency = ''
                try:
                    expectedUpdateFrequency_score = item['usabilityRating']['updateFrequencyScore']
                    if expectedUpdateFrequency_score == 0:
                        expectedUpdateFrequency = '0: Not specified'
                except Exception as e:
                    print('expectedUpdateFrequency error', e)
                tags = []
                categories = item['categories']['categories']
                for categorie in categories:
                    tags.append(categorie['name'])
                collaborators = ''
                collaboratorIconPath = ''
                # Similar datasets (best effort; failures leave it empty).
                similarDatasets = ''
                try:
                    similar_url = source + '/suggestions.json'
                    similar_headers = headers.copy()
                    similar_headers['Referer'] = source
                    # BUGFIX: the headers dict was passed positionally, which is
                    # requests.get's `params` argument — the headers (including
                    # the Referer override) were never actually sent.
                    response = requests.get(similar_url, headers=similar_headers)
                    if response.status_code == 200:
                        json_data = json.loads(response.text)
                        similarDatasets_dict = {}
                        similar_index = 1
                        for every in json_data:
                            name = every['title']
                            start_url = 'https://www.kaggle.com'
                            name_url = start_url + every['entityUrl']
                            thumbnailImageUrl = every['thumbnailImageUrl']
                            temp_dict = {similar_index: {'name': name, 'nameUrl': name_url, 'nameImgUrl': thumbnailImageUrl}}
                            similarDatasets_dict.update(temp_dict)
                            similar_index += 1
                        similarDatasets = similarDatasets_dict.copy()
                except Exception:
                    pass
                # Dataset description + the raw column payload (best effort).
                description = ''
                description_json = ''
                ColumsData_str = ''
                try:
                    description_str, ColumsData_str = parseUrl(source)
                    description_json = json.loads(description_str)
                    description = description_json['description']
                except Exception as e:
                    print('description error', e)
                spiderDateTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                Data = {
                    'datasetId': datasetId,
                    'doi': doi,
                    'handle': handle,
                    'name': title,
                    'overview': overview,
                    'tags': tags,
                    'size': datasetSize,
                    'description': description,
                    'description_source': description_json,
                    'data': '',
                    'data_source': ColumsData_str,
                    'similarDatasets': similarDatasets,
                    'license': license_name,
                    'visibility': visibility,
                    'provenance_source': '',
                    'datasetOwner': datasetOwner,
                    'datasetOwnerIconPath': datasetOwnerIconPath,
                    'collaborators': collaborators,
                    'collaboratorIconPath': collaboratorIconPath,
                    'expectedUpdateFrequency': expectedUpdateFrequency,
                    'lastUpdated': lastUpdated,
                    'dateCreated': dateCreated,
                    'currentVersion': currentVersion,
                    'path': '/data/www/html/dataset/Kaggle',
                    'source': source,
                    'file_url': downloadUrl,
                    'data_json': item,
                    'spiderDateTime': spiderDateTime,
                }
                Data_json = json.dumps(Data)
                # One JSON object per line, appended.
                with open('KaggleData4.json', 'a+', encoding='utf-8') as f:
                    f.write(Data_json)
                    f.write('\n')
            except Exception as e:
                print('cuowu= ', e)
                # NOTE(review): item['downloadUrl'] may itself be the missing
                # key that caused the failure — kept as the original behavior.
                with open('error.json', 'a+', encoding='utf-8') as f:
                    f.write(BASE_URL + item['downloadUrl'])
                    f.write('\n')
    else:
        print('错误响应码为---: ', response.status_code)


def main():
    """Fetch the XSRF token, then crawl every datasets search page."""
    token = getToken()
    getEveryPageData(token)


if __name__ == '__main__':
    main()

Kaggle网站数据集抓取相关推荐

  1. 浅析搜索引擎对网站的抓取规则有哪些方面

    众多网络优化公司都有一个共识,那就是每个搜索引擎的排名实际上都是综合网站各方面因素进行综合判定的.在某种情况下凑巧让一些网站得出了一些所谓的优化结论,实际上关于搜索引擎的算法谁都不能确保自己完全了解, ...

  2. 百度SEO网站链接抓取器 v1.0

    简介: 网站链接抓取器可以帮助用户对网站上面的各个内容进行抓取,操作很简单,用户只需要输入网址域名就能够获取相应的源代码,从而获取网站上面的图片.脚本等等,适用于需要淘宝抓图或者仿站的用户,支持批量抓 ...

  3. 根据专利号到专利查询的网站上抓取想要的信息(上)

    前述:前几天看到有人论要请别人写一个从从网页上抓取某个专利号的收费信息的一个程序,说实话我自己知道那里面的原理是什么,但一直没有自己动手实现以下.根据自己的实际的工作需要一般是有一张Excel表,第一 ...

  4. WordPress快速增加百度收录,加快网站内容抓取

    本文已同步到专业技术网站 www.sufaith.com, 该网站专注于前后端开发技术与经验分享, 包含Web开发.Nodejs.Python.Linux.IT资讯等板块. 利用百度站长平台提供的链接 ...

  5. 免费网站数据抓取插件,可视化页面数据抓取插件

    网站页面数据抓取插件,允许我们将数据从网站直接抓取到我们的本地或者页面.网站网页数据抓取(也称为ScreenScraping.WebDataExtraction.WebHarvesting等)是一种用 ...

  6. 为何大量网站不能抓取?爬虫突破封禁的6种常见方法

    为何大量网站不能抓取?爬虫突破封禁的6种常见方法 在互联网上进行自动数据采集(抓取)这件事和互联网存在的时间差不多一样长.今天大众好像更倾向于用"网络数据采集",有时会把网络数据采 ...

  7. 根据专利号到专利查询的网站上抓取想要的信息(下)

    上一回讲了怎么根据一个专利号发送我们要查询信息的请求,详情请看根据专利号到专利查询的网站上抓取想要的信息(上).接下来要做的就是从一个Excel表中去读取我们要查的一系列的申请号,然后将抓到的信息写到 ...

  8. 百度爬虫:如何提高百度蜘蛛对网站的抓取量

    百度爬虫跟百度蜘蛛其实是一回事,百度蜘蛛在抓取网页信息后,会通过临时数据库进行处理,处理后的内容会被分门别类的收到索引库,等用户搜索相关关键词的时候才会在搜索结果页展现出来. 如何提高百度蜘蛛对网站的 ...

  9. 第十一章 从网站图片中抓取文字

    #!/usr/bin/env python # _*_ coding:utf-8 _*_ #这一节是数144页从网站图片抓取文字的代码, #需要安装tesseract,它是开源的可以通过训练识别出字体 ...

最新文章

  1. AutoScan-收集监视及办理器械
  2. 纯原生仿ES6的Object.assign,实现深度合并对象
  3. leetcode - 375. 猜数字大小 II
  4. 安装`lrzsz`包及其报错解决办法
  5. 机器视觉硬件选型——镜头选型
  6. JavaScript的闭包与应用
  7. MariaDB -- 数据类型
  8. wsdl2java工具生成客户端代码
  9. SuperCard与GBA
  10. gd32f303 设计中断优先级_RTOS内核的设计与实现
  11. java 事务管理 子父线程_java父线程子线程(转)
  12. 嵌入式linux音频播放器设计,基于嵌入式Linux下Madplay音频播放器设计论文.docx
  13. unity 多台 显示器 控制_设计专业显示器,哪些参数重要?明基PD2700U显示器给你答案...
  14. js控制右侧滚动条事件
  15. 应用ceph文件系统存储(ceph-13.2.10)
  16. 我的世界服务器背景音乐修改,我的世界怎么自定义背景音乐教程攻略
  17. 卡巴斯基实验室被攻陷后的四个未解之谜
  18. 圆周率100位可以这样速记
  19. [转载]用Java开发企业级无线应用
  20. 成型泡沫组件的全球与中国市场2022-2028年:技术、参与者、趋势、市场规模及占有率研究报告

热门文章

  1. find命令基本用法详解
  2. 缓存穿透、缓存击穿、缓存雪崩区别和解决方案
  3. 清华大学计算机相关视频教程汇总
  4. 施努卡:机器视觉系统的组成(视觉包括哪几部分)
  5. SQL注释怎么写以及SQL分类
  6. 2019 vs 安装odt_北京市2019二级造价师考试教材出版信息,免费送考试大纲
  7. vue项目搭建,启动
  8. 我们应该知道的java位运算
  9. jQuery.nivo.slider.js 幻灯片图片切换
  10. Linux查看文件大小的几种方法(超全)