麻辣烫的标题和对应的网址文件化
可直接运行,无需预先创建文件。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# date :2021/7/30 12:57
# author:Sabo
"""
Save the titles of the SCTV "mlt" show pages, and their URLs, to text files.

Change titleTxtName and titleAndUrlTxtName to store the titles and the
title/URL pairs in different files.
"""
import os

import requests
from bs4 import BeautifulSoup

# File that receives the bare titles (one per record).
titleTxtName = "title.txt"
# File that receives "Title : ..." / "Url : ..." records.
titleAndUrlTxtName = "titleAndUrl2.txt"
# Index page URL without its suffix; page numbers are inserted before urlTail.
root = 'http://show.sctv.com/mlt/index'
urlTail = '.shtml'
# HTTP headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    'Connection': 'close',
}

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def _fetchSoup(url, errorMessage):
    """Download ``url`` and return it parsed with BeautifulSoup.

    Prints ``errorMessage`` and returns None when the request fails or the
    status code is not 200.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        print(errorMessage)
        return None
    if response.status_code != 200:
        print(errorMessage)
        return None
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def getRootUrls(printFlag):
    """Return the ten index-page URLs built from ``root`` and ``urlTail``.

    The first URL is ``root + urlTail``; the others are
    ``root + '_' + n + urlTail`` for n = 1..9.

    :param printFlag: when true, print the resulting list.
    """
    rootUrls = []
    for i in range(10):
        if i != 0:
            rootUrls.append(root + '_' + str(i) + urlTail)
        else:
            rootUrls.append(root + urlTail)
    if printFlag:
        print(rootUrls)
    return rootUrls


def getLinksPerRootUrl(rootUrl, printFlag):
    """Return the href of the first ``<a>`` after each ``<div class="txt">``.

    :param rootUrl: URL of the index page to download.
    :param printFlag: when true, print the collected links.
    :return: list of href values; an empty list when the page cannot be fetched.
    """
    mainLink = _fetchSoup(rootUrl, 'Get response error!')
    if mainLink is None:
        return []
    childLinks = []
    for txtDiv in mainLink.find_all('div', attrs={"class": "txt"}):
        link = txtDiv.find_next('a')
        childLinks.append(link.get('href'))
    if printFlag:
        print(childLinks)
    return childLinks


def catUrl(catFlag, signalLinks):
    """Turn page-relative links into absolute URLs.

    The first character of every link is dropped and the remainder is
    appended to ``http://show.sctv.com/mlt`` (followed by ``catFlag`` when it
    is not 0).

    :param catFlag: 0 for no suffix, otherwise a value appended to the base URL.
    :param signalLinks: iterable of relative links.
    :return: list of absolute URLs, in input order.
    """
    base = 'http://show.sctv.com/mlt'
    if catFlag != 0:
        base = base + str(catFlag)
    return [base + link[1:] for link in signalLinks]


def urlTitles(rootUrl):
    """Return the text of every ``<div class="name">`` on the page.

    :return: list of titles; an empty list when the page cannot be fetched.
    """
    mainPage = _fetchSoup(rootUrl, 'Get titles error!')
    if mainPage is None:
        return []
    return [nameDiv.text for nameDiv in mainPage.find_all('div', attrs={'class': 'name'})]


def checkTitleExist(dstTitle, checkTitleTxtName):
    """Report whether ``dstTitle`` is already a line of ``checkTitleTxtName``.

    Both the title and the file lines are compared with surrounding
    whitespace removed, so a title scraped with stray spaces or a trailing
    newline is still recognised on the next run.
    """
    wanted = dstTitle.strip()
    with open(checkTitleTxtName, mode="r", encoding="utf-8") as f:
        found = any(line.strip() == wanted for line in f)
    if found:
        print("{0}中已经包含了{1}".format(checkTitleTxtName, dstTitle))
        return True
    print("{0}中未包含{1}".format(checkTitleTxtName, dstTitle))
    return False


def appendTitle(dstTitle, appendTitleTxtName):
    """Append ``dstTitle`` followed by a blank line to ``appendTitleTxtName``."""
    print("正在向{0}中添加{1}".format(appendTitleTxtName, dstTitle))
    with open(appendTitleTxtName, mode="a", encoding="utf-8") as f:
        f.write(dstTitle)
        f.write("\n")
        f.write("\n")


def appendTitleAndUrl(title, dstUrl, TitleAndUrlTxtName):
    """Append a ``Title : ...`` / ``Url : ...`` record plus a blank line."""
    print("正在向{0}中添加\n{1} : {2}".format(TitleAndUrlTxtName, title, dstUrl))
    with open(TitleAndUrlTxtName, mode="a", encoding="utf-8") as f:
        f.write("Title : {0}".format(title))
        f.write("\n")
        f.write("Url : {0}".format(dstUrl))
        f.write("\n")
        f.write("\n")


def checkTxtExist(titleTxtName, titleAndUrlTxtName):
    """Create the two record files as empty files when they do not exist.

    Existence is tested on the path itself rather than on the listing of the
    current directory, so names containing a directory part are handled and
    an existing file is never truncated.
    """
    for fileName in (titleTxtName, titleAndUrlTxtName):
        if not os.path.exists(fileName):
            print("没有{0}文件,正在创建~".format(fileName))
            with open(fileName, "w", encoding="utf-8"):
                pass


def main():
    """Scrape every index page and record each title not yet in the title file."""
    checkTxtExist(titleTxtName=titleTxtName, titleAndUrlTxtName=titleAndUrlTxtName)
    catFlag = 0
    for RootUrl in getRootUrls(printFlag=False):
        links = getLinksPerRootUrl(rootUrl=RootUrl, printFlag=False)
        titles = urlTitles(rootUrl=RootUrl)
        dstUrls = catUrl(catFlag=catFlag, signalLinks=links)
        if len(titles) != len(dstUrls):
            print('Title/link count mismatch on {0}: {1} titles, {2} links'.format(
                RootUrl, len(titles), len(dstUrls)))
        # zip() stops at the shorter list, so a failed or partial link fetch
        # cannot raise IndexError half-way through writing a record.
        for title, dstUrl in zip(titles, dstUrls):
            print("-" * 50)
            if not checkTitleExist(dstTitle=title, checkTitleTxtName=titleTxtName):
                appendTitle(dstTitle=title, appendTitleTxtName=titleTxtName)
                appendTitleAndUrl(TitleAndUrlTxtName=titleAndUrlTxtName, title=title, dstUrl=dstUrl)


if __name__ == '__main__':
    main()
加入文件夹创建
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# date :2021/7/30 12:57
# author:Sabo
"""
Save the titles of the SCTV "mlt" show pages, and their URLs, to text files.

Change titleTxtName and titleAndUrlTxtName to store the titles and the
title/URL pairs in different files.
"""
import os

import requests
from bs4 import BeautifulSoup

# Directory that holds the record files.
dirName = "Sctv3Record"
# File that receives the bare titles (one per record).
titleTxtName = "title.txt"
# File that receives "Title : ..." / "Url : ..." records.
titleAndUrlTxtName = "titleAndUrl2.txt"
# Index page URL without its suffix; page numbers are inserted before urlTail.
root = 'http://show.sctv.com/mlt/index'
urlTail = '.shtml'
# HTTP headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    'Connection': 'close',
}

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def _fetchSoup(url, errorMessage):
    """Download ``url`` and return it parsed with BeautifulSoup.

    Prints ``errorMessage`` and returns None when the request fails or the
    status code is not 200.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        print(errorMessage)
        return None
    if response.status_code != 200:
        print(errorMessage)
        return None
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def getRootUrls(printFlag):
    """Return the ten index-page URLs built from ``root`` and ``urlTail``.

    The first URL is ``root + urlTail``; the others are
    ``root + '_' + n + urlTail`` for n = 1..9.

    :param printFlag: when true, print the resulting list.
    """
    rootUrls = []
    for i in range(10):
        if i != 0:
            rootUrls.append(root + '_' + str(i) + urlTail)
        else:
            rootUrls.append(root + urlTail)
    if printFlag:
        print(rootUrls)
    return rootUrls


def getLinksPerRootUrl(rootUrl, printFlag):
    """Return the href of the first ``<a>`` after each ``<div class="txt">``.

    :param rootUrl: URL of the index page to download.
    :param printFlag: when true, print the collected links.
    :return: list of href values; an empty list when the page cannot be fetched.
    """
    mainLink = _fetchSoup(rootUrl, 'Get response error!')
    if mainLink is None:
        return []
    childLinks = []
    for txtDiv in mainLink.find_all('div', attrs={"class": "txt"}):
        link = txtDiv.find_next('a')
        childLinks.append(link.get('href'))
    if printFlag:
        print(childLinks)
    return childLinks


def catUrl(catFlag, signalLinks):
    """Turn page-relative links into absolute URLs.

    The first character of every link is dropped and the remainder is
    appended to ``http://show.sctv.com/mlt`` (followed by ``catFlag`` when it
    is not 0).

    :param catFlag: 0 for no suffix, otherwise a value appended to the base URL.
    :param signalLinks: iterable of relative links.
    :return: list of absolute URLs, in input order.
    """
    base = 'http://show.sctv.com/mlt'
    if catFlag != 0:
        base = base + str(catFlag)
    return [base + link[1:] for link in signalLinks]


def urlTitles(rootUrl):
    """Return the text of every ``<div class="name">`` on the page.

    :return: list of titles; an empty list when the page cannot be fetched.
    """
    mainPage = _fetchSoup(rootUrl, 'Get titles error!')
    if mainPage is None:
        return []
    return [nameDiv.text for nameDiv in mainPage.find_all('div', attrs={'class': 'name'})]


def checkTitleExist(dstTitle, checkTitleTxtName):
    """Report whether ``dstTitle`` is already a line of ``checkTitleTxtName``.

    Both the title and the file lines are compared with surrounding
    whitespace removed, so a title scraped with stray spaces or a trailing
    newline is still recognised on the next run.
    """
    wanted = dstTitle.strip()
    with open(checkTitleTxtName, mode="r", encoding="utf-8") as f:
        found = any(line.strip() == wanted for line in f)
    if found:
        print("{0}中已经包含了{1}".format(checkTitleTxtName, dstTitle))
        return True
    print("{0}中未包含{1}".format(checkTitleTxtName, dstTitle))
    return False


def appendTitle(dstTitle, appendTitleTxtName):
    """Append ``dstTitle`` followed by a blank line to ``appendTitleTxtName``."""
    print("正在向{0}中添加{1}".format(appendTitleTxtName, dstTitle))
    with open(appendTitleTxtName, mode="a", encoding="utf-8") as f:
        f.write(dstTitle)
        f.write("\n")
        f.write("\n")


def appendTitleAndUrl(title, dstUrl, TitleAndUrlTxtName):
    """Append a ``Title : ...`` / ``Url : ...`` record plus a blank line."""
    print("正在向{0}中添加\n{1} : {2}".format(TitleAndUrlTxtName, title, dstUrl))
    with open(TitleAndUrlTxtName, mode="a", encoding="utf-8") as f:
        f.write("Title : {0}".format(title))
        f.write("\n")
        f.write("Url : {0}".format(dstUrl))
        f.write("\n")
        f.write("\n")


def checkTxtExist(titleTxtName, titleAndUrlTxtName):
    """Create the two record files as empty files when they do not exist.

    main() passes paths that include the record directory. Existence is
    therefore tested on the path itself; testing against the listing of the
    current directory never matches such a path and would truncate both
    files on every run.
    """
    for fileName in (titleTxtName, titleAndUrlTxtName):
        if not os.path.exists(fileName):
            print("没有{0}文件,正在创建~".format(fileName))
            with open(fileName, "w", encoding="utf-8"):
                pass


def checkDir(dirName):
    """Create the directory ``dirName`` when it does not exist yet."""
    os.makedirs(dirName, exist_ok=True)


def main():
    """Scrape every index page and record each title not yet in the title file."""
    checkDir(dirName=dirName)
    # Local paths instead of rebinding the module globals: calling main()
    # twice must not prefix the directory twice.
    titlePath = dirName + "/" + titleTxtName
    titleAndUrlPath = dirName + "/" + titleAndUrlTxtName
    checkTxtExist(titleTxtName=titlePath, titleAndUrlTxtName=titleAndUrlPath)
    catFlag = 0
    for RootUrl in getRootUrls(printFlag=False):
        links = getLinksPerRootUrl(rootUrl=RootUrl, printFlag=False)
        titles = urlTitles(rootUrl=RootUrl)
        dstUrls = catUrl(catFlag=catFlag, signalLinks=links)
        if len(titles) != len(dstUrls):
            print('Title/link count mismatch on {0}: {1} titles, {2} links'.format(
                RootUrl, len(titles), len(dstUrls)))
        # zip() stops at the shorter list, so a failed or partial link fetch
        # cannot raise IndexError half-way through writing a record.
        for title, dstUrl in zip(titles, dstUrls):
            print("-" * 50)
            if not checkTitleExist(dstTitle=title, checkTitleTxtName=titlePath):
                appendTitle(dstTitle=title, appendTitleTxtName=titlePath)
                appendTitleAndUrl(TitleAndUrlTxtName=titleAndUrlPath, title=title, dstUrl=dstUrl)


if __name__ == '__main__':
    main()
更正
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# date :2021/7/30 12:57
# author:Sabo
"""
Save the titles of the SCTV "mlt" show pages, and their URLs, to text files.

Change titleTxtName and titleAndUrlTxtName to store the titles and the
title/URL pairs in different files.
"""
import os

import requests
from bs4 import BeautifulSoup

# Directory that holds the record files.
dirName = "Sctv3Record"
# File that receives the bare titles (one per record).
titleTxtName = "title.txt"
# File that receives "Title : ..." / "Url : ..." records.
titleAndUrlTxtName = "titleAndUrl2.txt"
# Index page URL without its suffix; page numbers are inserted before urlTail.
root = 'http://show.sctv.com/mlt/index'
urlTail = '.shtml'
# HTTP headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    'Connection': 'close',
}

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def _fetchSoup(url, errorMessage):
    """Download ``url`` and return it parsed with BeautifulSoup.

    Prints ``errorMessage`` and returns None when the request fails or the
    status code is not 200.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        print(errorMessage)
        return None
    if response.status_code != 200:
        print(errorMessage)
        return None
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def getRootUrls(printFlag):
    """Return the ten index-page URLs built from ``root`` and ``urlTail``.

    The first URL is ``root + urlTail``; the others are
    ``root + '_' + n + urlTail`` for n = 1..9.

    :param printFlag: when true, print the resulting list.
    """
    rootUrls = []
    for i in range(10):
        if i != 0:
            rootUrls.append(root + '_' + str(i) + urlTail)
        else:
            rootUrls.append(root + urlTail)
    if printFlag:
        print(rootUrls)
    return rootUrls


def getLinksPerRootUrl(rootUrl, printFlag):
    """Return the href of the first ``<a>`` after each ``<div class="txt">``.

    :param rootUrl: URL of the index page to download.
    :param printFlag: when true, print the collected links.
    :return: list of href values; an empty list when the page cannot be fetched.
    """
    mainLink = _fetchSoup(rootUrl, 'Get response error!')
    if mainLink is None:
        return []
    childLinks = []
    for txtDiv in mainLink.find_all('div', attrs={"class": "txt"}):
        link = txtDiv.find_next('a')
        childLinks.append(link.get('href'))
    if printFlag:
        print(childLinks)
    return childLinks


def catUrl(catFlag, signalLinks):
    """Turn page-relative links into absolute URLs.

    The first character of every link is dropped and the remainder is
    appended to ``http://show.sctv.com/mlt`` (followed by ``catFlag`` when it
    is not 0).

    :param catFlag: 0 for no suffix, otherwise a value appended to the base URL.
    :param signalLinks: iterable of relative links.
    :return: list of absolute URLs, in input order.
    """
    base = 'http://show.sctv.com/mlt'
    if catFlag != 0:
        base = base + str(catFlag)
    return [base + link[1:] for link in signalLinks]


def urlTitles(rootUrl):
    """Return the text of every ``<div class="name">`` on the page.

    :return: list of titles; an empty list when the page cannot be fetched.
    """
    mainPage = _fetchSoup(rootUrl, 'Get titles error!')
    if mainPage is None:
        return []
    return [nameDiv.text for nameDiv in mainPage.find_all('div', attrs={'class': 'name'})]


def checkTitleExist(dstTitle, checkTitleTxtName):
    """Report whether ``dstTitle`` is already a line of ``checkTitleTxtName``.

    Both the title and the file lines are compared with surrounding
    whitespace removed, so a title scraped with stray spaces or a trailing
    newline is still recognised on the next run.
    """
    wanted = dstTitle.strip()
    with open(checkTitleTxtName, mode="r", encoding="utf-8") as f:
        found = any(line.strip() == wanted for line in f)
    if found:
        print("{0}中已经包含了{1}".format(checkTitleTxtName, dstTitle))
        return True
    print("{0}中未包含{1}".format(checkTitleTxtName, dstTitle))
    return False


def appendTitle(dstTitle, appendTitleTxtName):
    """Append ``dstTitle`` followed by a blank line to ``appendTitleTxtName``."""
    print("正在向{0}中添加{1}".format(appendTitleTxtName, dstTitle))
    with open(appendTitleTxtName, mode="a", encoding="utf-8") as f:
        f.write(dstTitle)
        f.write("\n")
        f.write("\n")


def appendTitleAndUrl(title, dstUrl, TitleAndUrlTxtName):
    """Append a ``Title : ...`` / ``Url : ...`` record plus a blank line."""
    print("正在向{0}中添加\n{1} : {2}".format(TitleAndUrlTxtName, title, dstUrl))
    with open(TitleAndUrlTxtName, mode="a", encoding="utf-8") as f:
        f.write("Title : {0}".format(title))
        f.write("\n")
        f.write("Url : {0}".format(dstUrl))
        f.write("\n")
        f.write("\n")


def checkTxtExist(dirName, titleTxtName, titleAndUrlTxtName):
    """Create the two record files inside ``dirName`` when they are missing.

    :param dirName: directory that holds the record files.
    :param titleTxtName: file name (without directory) of the title file.
    :param titleAndUrlTxtName: file name (without directory) of the title/URL file.
    """
    for fileName in (titleTxtName, titleAndUrlTxtName):
        # The file must be created in dirName, where main() later reads and
        # appends to it, not in the current working directory.
        filePath = os.path.join(dirName, fileName)
        if not os.path.exists(filePath):
            print("没有{0}文件,正在创建~".format(fileName))
            with open(filePath, "w", encoding="utf-8"):
                pass


def checkDir(dirName):
    """Create the directory ``dirName`` when it does not exist yet."""
    os.makedirs(dirName, exist_ok=True)


def main():
    """Scrape every index page and record each title not yet in the title file."""
    checkDir(dirName=dirName)
    checkTxtExist(titleTxtName=titleTxtName, titleAndUrlTxtName=titleAndUrlTxtName, dirName=dirName)
    # Local paths instead of rebinding the module globals: calling main()
    # twice must not prefix the directory twice.
    titlePath = dirName + "/" + titleTxtName
    titleAndUrlPath = dirName + "/" + titleAndUrlTxtName
    catFlag = 0
    for RootUrl in getRootUrls(printFlag=False):
        links = getLinksPerRootUrl(rootUrl=RootUrl, printFlag=False)
        titles = urlTitles(rootUrl=RootUrl)
        dstUrls = catUrl(catFlag=catFlag, signalLinks=links)
        if len(titles) != len(dstUrls):
            print('Title/link count mismatch on {0}: {1} titles, {2} links'.format(
                RootUrl, len(titles), len(dstUrls)))
        # zip() stops at the shorter list, so a failed or partial link fetch
        # cannot raise IndexError half-way through writing a record.
        for title, dstUrl in zip(titles, dstUrls):
            print("-" * 50)
            if not checkTitleExist(dstTitle=title, checkTitleTxtName=titlePath):
                appendTitle(dstTitle=title, appendTitleTxtName=titlePath)
                appendTitleAndUrl(TitleAndUrlTxtName=titleAndUrlPath, title=title, dstUrl=dstUrl)


if __name__ == '__main__':
    main()
“麻辣烫的标题和对应的网址文件化”相关推荐
- 根据文献标题免费下载PDF格式文件的文献内容
根据文献标题免费下载PDF格式文件的文献内容 通过文献名下载文献:较为通用,如有不能下载情况请留言. 第一步:根据文献名,查出文献的DOI. 进入https://www.crossref.org/ , ...
- dfmea文件_PFMEA执行之步骤七:结果文件化 vs 高层管理者的承诺
0.本章大纲 概览 PFMEA执行报告 高层管理者的承诺 vs 持续改进 PFMEA执行步骤七之输出物 常见不符合项 1.概览 结果文件化步骤的目的是,针对PFMEA活动的结果进行总结和交流 将过程结 ...
- wp博客链接.html,WordPress博客文章标题链接到自定义网址链接
WordPress 支持多种文章形式,相信不少博主遇到过需要把wordpress文章的标题链接到自定义的URL地址的情况,点击文章链接时不是直接访问文章详情页面,而是直接访问分享的链接网页,就像微博一 ...
- 百度搜索技巧,精确搜索,搜索指定标题、内容、网址,黑语法搜索入门
经常用百度的话,有时有想搜的内容但是却不能搜到,搜出一堆虽然有关键词但是根本不是你想要的内容. 因为百度有他的算法,大多数时候他能给你最想要的结果,但是有的时候,你想搜一些比较偏的东西,那就怎么也不行 ...
- 【无标题】导出为Excel文件
如何将layui表格的数据导出为Excel文件? 下图为视图代码 控制器代码如下 public ActionResult ExportExcel() { //查询信息 var list = (from ...
- 【无标题】AD导入CAD文件发现找不到图形
AD导入CAD文件发现找不到图形,位置偏的厉害,这其实是软件bug,按下面方法操作: 1.将CAD文件放大39.37倍,精度越高越好,因为1mm=39.37007874mil;保存文件. 2.打开AD ...
- 【无标题】Linux查看.del文件内容
view + .del文件 退出查看: :q!
- Windows取证——登录过的用户名、新建的用户名和访问的网址文件(墨者学院)
目录 前言知识点: 一.登录过的用户名 二.新建的用户名 法一: 法二:
- Qt 窗口的一些简单设置-标题、图标、最大化最小化按钮、任务栏图标
一些很零散的窗体控制方法,在这总结一些. 1.更改窗体标题 this->setWindowTitle("窗体标题"); 窗体标题"就是更改的窗体标题 2.控制窗体大 ...
最新文章
- 超级详细的 Python 数据分析指南
- TypeScript 3.0下react默认属性DefaultProps解决方案
- vs2022 qt环境搭建调试
- 开发linux显卡驱动,显卡驱动开发DRM入门--Apple的学习笔记
- Razor视图引擎浅析
- 2020蓝桥杯省赛---java---B---6(分类计数)
- 安装inde.html使用babel,reactjs – 使用Babel Standalone进行单个React组件渲染,仅使用index.html和Component...
- 云计算的三种服务模式(SaaS、PasS、IaaS)介绍
- 【NLP】毕设学习笔记(八)“前馈 + 反馈” = 循环神经网络RNN
- PyCharm 下提示 'no module named time'
- Python 从入门到进阶
- 小程序Table样式
- origin 快捷键
- MTSP遗传算法解决
- 模拟购物车系统(添加、修改、查询、结算)(Java实现)
- win10连接filco蓝牙键盘
- Gvim中实现特定行文本的替换
- 「硬核讲解」通达信跨周期引用均线指标公式
- 为什么选php语言做网站,php做网站教程:PHP语言怎么做网站
- 把指针当作动态数组使用