可直接运行,无需预先创建文件。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# date :2021/7/30 12:57
# author:Sabo
"""
Save the titles and URLs listed on the show.sctv.com "mlt" index pages.

Change titleTxtName and titleAndUrlTxtName to store the titles and the
title/URL pairs in different files.
"""
import os

import requests
from bs4 import BeautifulSoup

# File that receives one title per record.
titleTxtName = "title.txt"
# File that receives "Title / Url" records.
titleAndUrlTxtName = "titleAndUrl2.txt"
# Index page URL without its suffix; later pages insert "_<n>" before urlTail.
root = 'http://show.sctv.com/mlt/index'
urlTail = '.shtml'
# Request headers sent with every page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/88.0.4324.104 Safari/537.36',
    'Connection': 'close',
}


def getRootUrls(printFlag):
    """Build the list of the ten index page URLs.

    The first page is ``root + urlTail``; pages 1..9 are
    ``root + '_' + <n> + urlTail``.

    Args:
        printFlag: When truthy, print the resulting list.

    Returns:
        list[str]: The ten index page URLs, in page order.
    """
    rootUrls = [root + urlTail]
    rootUrls.extend(root + '_' + str(page) + urlTail for page in range(1, 10))
    if printFlag:
        print(rootUrls)
    return rootUrls


def getLinksPerRootUrl(rootUrl, printFlag):
    """Collect the href that follows every ``<div class="txt">`` on a page.

    Args:
        rootUrl: URL of the index page to download.
        printFlag: When truthy, print the collected links.

    Returns:
        list: One href per matching div, in document order. An empty list is
        returned (after printing a message) when the HTTP status is not 200.
    """
    response = requests.get(url=rootUrl, headers=headers)
    if response.status_code != 200:
        print('Get response error!')
        return []
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    childLinks = [
        block.find_next('a').get('href')
        for block in page.find_all('div', attrs={"class": "txt"})
    ]
    if printFlag:
        print(childLinks)
    return childLinks


def catUrl(catFlag, signalLinks):
    """Turn the collected hrefs into absolute URLs.

    The first character of every link is dropped before it is appended to
    the base URL (presumably the leading '.' of a relative href -- confirm
    against the live pages).

    Args:
        catFlag: ``0`` to join directly onto the base URL; any other value
            is converted to text and inserted between the base and the link.
        signalLinks: Sequence of href strings.

    Returns:
        list[str]: One absolute URL per input link.
    """
    base = 'http://show.sctv.com/mlt'
    prefix = base if catFlag == 0 else base + str(catFlag)
    return [prefix + link[1:] for link in signalLinks]


def urlTitles(rootUrl):
    """Collect the text of every ``<div class="name">`` on a page.

    Args:
        rootUrl: URL of the index page to download.

    Returns:
        list[str]: The titles in document order. An empty list is returned
        (after printing a message) when the HTTP status is not 200.
    """
    response = requests.get(url=rootUrl, headers=headers)
    if response.status_code != 200:
        print('Get titles error!')
        return []
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    return [block.text for block in page.find_all('div', attrs={'class': 'name'})]


def checkTitleExist(dstTitle, checkTitleTxtName):
    """Report whether a title is already recorded in the title file.

    Args:
        dstTitle: Title to look for.
        checkTitleTxtName: Path of the title file; it must already exist.

    Returns:
        bool: True when a line of the file equals the title, else False.
    """
    with open(checkTitleTxtName, mode="r", encoding="utf-8") as f:
        recorded = {line.strip() for line in f}
    # Stored lines are stripped, so strip the candidate too; otherwise a
    # title with surrounding whitespace could never match its own record.
    if dstTitle.strip() in recorded:
        print("{0}中已经包含了{1}".format(checkTitleTxtName, dstTitle))
        return True
    print("{0}中未包含{1}".format(checkTitleTxtName, dstTitle))
    return False


def appendTitle(dstTitle, appendTitleTxtName):
    """Append a title followed by an empty line to the title file.

    Args:
        dstTitle: Title to record.
        appendTitleTxtName: Path of the title file.
    """
    print("正在向{0}中添加{1}".format(appendTitleTxtName, dstTitle))
    with open(appendTitleTxtName, mode="a", encoding="utf-8") as f:
        f.write(dstTitle)
        f.write("\n")
        f.write("\n")


def appendTitleAndUrl(title, dstUrl, TitleAndUrlTxtName):
    """Append a "Title / Url" record followed by an empty line.

    Args:
        title: Title to record.
        dstUrl: URL that belongs to the title.
        TitleAndUrlTxtName: Path of the title/URL file.
    """
    print("正在向{0}中添加\n{1} : {2}".format(TitleAndUrlTxtName, title, dstUrl))
    with open(TitleAndUrlTxtName, mode="a", encoding="utf-8") as f:
        f.write("Title      :       {0}".format(title))
        f.write("\n")
        f.write("Url        :       {0}".format(dstUrl))
        f.write("\n")
        f.write("\n")


def checkTxtExist(titleTxtName, titleAndUrlTxtName):
    """Create the two record files when they do not exist yet.

    Existing files are left untouched. Paths are checked directly, so both
    bare file names and paths that include a directory work.

    Args:
        titleTxtName: Path of the title file.
        titleAndUrlTxtName: Path of the title/URL file.
    """
    for path in (titleTxtName, titleAndUrlTxtName):
        if not os.path.exists(path):
            print("没有{0}文件,正在创建~".format(path))
            with open(path, "w", encoding="utf-8"):
                pass


def main():
    """Download every index page and record the titles not stored yet."""
    checkTxtExist(titleTxtName=titleTxtName, titleAndUrlTxtName=titleAndUrlTxtName)
    catFlag = 0
    for rootUrl in getRootUrls(printFlag=False):
        links = getLinksPerRootUrl(rootUrl=rootUrl, printFlag=False)
        titles = urlTitles(rootUrl=rootUrl)
        dstUrls = catUrl(catFlag=catFlag, signalLinks=links)
        # Pair by position; zip stops at the shorter list, so a page with
        # unequal numbers of titles and links cannot raise IndexError.
        for title, dstUrl in zip(titles, dstUrls):
            print("-" * 50)
            if checkTitleExist(dstTitle=title, checkTitleTxtName=titleTxtName) is False:
                appendTitle(dstTitle=title, appendTitleTxtName=titleTxtName)
                appendTitleAndUrl(TitleAndUrlTxtName=titleAndUrlTxtName, title=title, dstUrl=dstUrl)


if __name__ == '__main__':
    main()

加入文件夹创建

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# date :2021/7/30 12:57
# author:Sabo
"""
Save the titles and URLs listed on the show.sctv.com "mlt" index pages.

Change titleTxtName and titleAndUrlTxtName to store the titles and the
title/URL pairs in different files; both files live inside dirName.
"""
import os

import requests
from bs4 import BeautifulSoup

# Directory that holds the two record files.
dirName = "Sctv3Record"
# File that receives one title per record.
titleTxtName = "title.txt"
# File that receives "Title / Url" records.
titleAndUrlTxtName = "titleAndUrl2.txt"
# Index page URL without its suffix; later pages insert "_<n>" before urlTail.
root = 'http://show.sctv.com/mlt/index'
urlTail = '.shtml'
# Request headers sent with every page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/88.0.4324.104 Safari/537.36',
    'Connection': 'close',
}


def getRootUrls(printFlag):
    """Build the list of the ten index page URLs.

    The first page is ``root + urlTail``; pages 1..9 are
    ``root + '_' + <n> + urlTail``.

    Args:
        printFlag: When truthy, print the resulting list.

    Returns:
        list[str]: The ten index page URLs, in page order.
    """
    rootUrls = [root + urlTail]
    rootUrls.extend(root + '_' + str(page) + urlTail for page in range(1, 10))
    if printFlag:
        print(rootUrls)
    return rootUrls


def getLinksPerRootUrl(rootUrl, printFlag):
    """Collect the href that follows every ``<div class="txt">`` on a page.

    Args:
        rootUrl: URL of the index page to download.
        printFlag: When truthy, print the collected links.

    Returns:
        list: One href per matching div, in document order. An empty list is
        returned (after printing a message) when the HTTP status is not 200.
    """
    response = requests.get(url=rootUrl, headers=headers)
    if response.status_code != 200:
        print('Get response error!')
        return []
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    childLinks = [
        block.find_next('a').get('href')
        for block in page.find_all('div', attrs={"class": "txt"})
    ]
    if printFlag:
        print(childLinks)
    return childLinks


def catUrl(catFlag, signalLinks):
    """Turn the collected hrefs into absolute URLs.

    The first character of every link is dropped before it is appended to
    the base URL (presumably the leading '.' of a relative href -- confirm
    against the live pages).

    Args:
        catFlag: ``0`` to join directly onto the base URL; any other value
            is converted to text and inserted between the base and the link.
        signalLinks: Sequence of href strings.

    Returns:
        list[str]: One absolute URL per input link.
    """
    base = 'http://show.sctv.com/mlt'
    prefix = base if catFlag == 0 else base + str(catFlag)
    return [prefix + link[1:] for link in signalLinks]


def urlTitles(rootUrl):
    """Collect the text of every ``<div class="name">`` on a page.

    Args:
        rootUrl: URL of the index page to download.

    Returns:
        list[str]: The titles in document order. An empty list is returned
        (after printing a message) when the HTTP status is not 200.
    """
    response = requests.get(url=rootUrl, headers=headers)
    if response.status_code != 200:
        print('Get titles error!')
        return []
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    return [block.text for block in page.find_all('div', attrs={'class': 'name'})]


def checkTitleExist(dstTitle, checkTitleTxtName):
    """Report whether a title is already recorded in the title file.

    Args:
        dstTitle: Title to look for.
        checkTitleTxtName: Path of the title file; it must already exist.

    Returns:
        bool: True when a line of the file equals the title, else False.
    """
    with open(checkTitleTxtName, mode="r", encoding="utf-8") as f:
        recorded = {line.strip() for line in f}
    # Stored lines are stripped, so strip the candidate too; otherwise a
    # title with surrounding whitespace could never match its own record.
    if dstTitle.strip() in recorded:
        print("{0}中已经包含了{1}".format(checkTitleTxtName, dstTitle))
        return True
    print("{0}中未包含{1}".format(checkTitleTxtName, dstTitle))
    return False


def appendTitle(dstTitle, appendTitleTxtName):
    """Append a title followed by an empty line to the title file.

    Args:
        dstTitle: Title to record.
        appendTitleTxtName: Path of the title file.
    """
    print("正在向{0}中添加{1}".format(appendTitleTxtName, dstTitle))
    with open(appendTitleTxtName, mode="a", encoding="utf-8") as f:
        f.write(dstTitle)
        f.write("\n")
        f.write("\n")


def appendTitleAndUrl(title, dstUrl, TitleAndUrlTxtName):
    """Append a "Title / Url" record followed by an empty line.

    Args:
        title: Title to record.
        dstUrl: URL that belongs to the title.
        TitleAndUrlTxtName: Path of the title/URL file.
    """
    print("正在向{0}中添加\n{1} : {2}".format(TitleAndUrlTxtName, title, dstUrl))
    with open(TitleAndUrlTxtName, mode="a", encoding="utf-8") as f:
        f.write("Title      :       {0}".format(title))
        f.write("\n")
        f.write("Url        :       {0}".format(dstUrl))
        f.write("\n")
        f.write("\n")


def checkTxtExist(titleTxtName, titleAndUrlTxtName):
    """Create the two record files when they do not exist yet.

    The paths are tested with ``os.path.exists`` rather than looked up in a
    listing of the current directory: main() passes paths that include the
    record directory, which a plain directory listing never contains, so
    the files would otherwise be truncated on every run.

    Args:
        titleTxtName: Path of the title file.
        titleAndUrlTxtName: Path of the title/URL file.
    """
    for path in (titleTxtName, titleAndUrlTxtName):
        if not os.path.exists(path):
            print("没有{0}文件,正在创建~".format(path))
            with open(path, "w", encoding="utf-8"):
                pass


def checkDir(dirName):
    """Create the record directory when it does not exist yet.

    Args:
        dirName: Directory path; an existing directory is left untouched.
    """
    os.makedirs(dirName, exist_ok=True)


def main():
    """Download every index page and record the titles not stored yet."""
    checkDir(dirName=dirName)
    # Keep the joined paths local: rebinding the module-level names would
    # prefix the directory again on every further call of main().
    titlePath = dirName + "/" + titleTxtName
    titleAndUrlPath = dirName + "/" + titleAndUrlTxtName
    checkTxtExist(titleTxtName=titlePath, titleAndUrlTxtName=titleAndUrlPath)
    catFlag = 0
    for rootUrl in getRootUrls(printFlag=False):
        links = getLinksPerRootUrl(rootUrl=rootUrl, printFlag=False)
        titles = urlTitles(rootUrl=rootUrl)
        dstUrls = catUrl(catFlag=catFlag, signalLinks=links)
        # Pair by position; zip stops at the shorter list, so a page with
        # unequal numbers of titles and links cannot raise IndexError.
        for title, dstUrl in zip(titles, dstUrls):
            print("-" * 50)
            if checkTitleExist(dstTitle=title, checkTitleTxtName=titlePath) is False:
                appendTitle(dstTitle=title, appendTitleTxtName=titlePath)
                appendTitleAndUrl(TitleAndUrlTxtName=titleAndUrlPath, title=title, dstUrl=dstUrl)


if __name__ == '__main__':
    main()

更正

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# date :2021/7/30 12:57
# author:Sabo
"""
Save the titles and URLs listed on the show.sctv.com "mlt" index pages.

Change titleTxtName and titleAndUrlTxtName to store the titles and the
title/URL pairs in different files; both files live inside dirName.
"""
import os

import requests
from bs4 import BeautifulSoup

# Directory that holds the two record files.
dirName = "Sctv3Record"
# File that receives one title per record.
titleTxtName = "title.txt"
# File that receives "Title / Url" records.
titleAndUrlTxtName = "titleAndUrl2.txt"
# Index page URL without its suffix; later pages insert "_<n>" before urlTail.
root = 'http://show.sctv.com/mlt/index'
urlTail = '.shtml'
# Request headers sent with every page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/88.0.4324.104 Safari/537.36',
    'Connection': 'close',
}


def getRootUrls(printFlag):
    """Build the list of the ten index page URLs.

    The first page is ``root + urlTail``; pages 1..9 are
    ``root + '_' + <n> + urlTail``.

    Args:
        printFlag: When truthy, print the resulting list.

    Returns:
        list[str]: The ten index page URLs, in page order.
    """
    rootUrls = [root + urlTail]
    rootUrls.extend(root + '_' + str(page) + urlTail for page in range(1, 10))
    if printFlag:
        print(rootUrls)
    return rootUrls


def getLinksPerRootUrl(rootUrl, printFlag):
    """Collect the href that follows every ``<div class="txt">`` on a page.

    Args:
        rootUrl: URL of the index page to download.
        printFlag: When truthy, print the collected links.

    Returns:
        list: One href per matching div, in document order. An empty list is
        returned (after printing a message) when the HTTP status is not 200.
    """
    response = requests.get(url=rootUrl, headers=headers)
    if response.status_code != 200:
        print('Get response error!')
        return []
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    childLinks = [
        block.find_next('a').get('href')
        for block in page.find_all('div', attrs={"class": "txt"})
    ]
    if printFlag:
        print(childLinks)
    return childLinks


def catUrl(catFlag, signalLinks):
    """Turn the collected hrefs into absolute URLs.

    The first character of every link is dropped before it is appended to
    the base URL (presumably the leading '.' of a relative href -- confirm
    against the live pages).

    Args:
        catFlag: ``0`` to join directly onto the base URL; any other value
            is converted to text and inserted between the base and the link.
        signalLinks: Sequence of href strings.

    Returns:
        list[str]: One absolute URL per input link.
    """
    base = 'http://show.sctv.com/mlt'
    prefix = base if catFlag == 0 else base + str(catFlag)
    return [prefix + link[1:] for link in signalLinks]


def urlTitles(rootUrl):
    """Collect the text of every ``<div class="name">`` on a page.

    Args:
        rootUrl: URL of the index page to download.

    Returns:
        list[str]: The titles in document order. An empty list is returned
        (after printing a message) when the HTTP status is not 200.
    """
    response = requests.get(url=rootUrl, headers=headers)
    if response.status_code != 200:
        print('Get titles error!')
        return []
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    return [block.text for block in page.find_all('div', attrs={'class': 'name'})]


def checkTitleExist(dstTitle, checkTitleTxtName):
    """Report whether a title is already recorded in the title file.

    Args:
        dstTitle: Title to look for.
        checkTitleTxtName: Path of the title file; it must already exist.

    Returns:
        bool: True when a line of the file equals the title, else False.
    """
    with open(checkTitleTxtName, mode="r", encoding="utf-8") as f:
        recorded = {line.strip() for line in f}
    # Stored lines are stripped, so strip the candidate too; otherwise a
    # title with surrounding whitespace could never match its own record.
    if dstTitle.strip() in recorded:
        print("{0}中已经包含了{1}".format(checkTitleTxtName, dstTitle))
        return True
    print("{0}中未包含{1}".format(checkTitleTxtName, dstTitle))
    return False


def appendTitle(dstTitle, appendTitleTxtName):
    """Append a title followed by an empty line to the title file.

    Args:
        dstTitle: Title to record.
        appendTitleTxtName: Path of the title file.
    """
    print("正在向{0}中添加{1}".format(appendTitleTxtName, dstTitle))
    with open(appendTitleTxtName, mode="a", encoding="utf-8") as f:
        f.write(dstTitle)
        f.write("\n")
        f.write("\n")


def appendTitleAndUrl(title, dstUrl, TitleAndUrlTxtName):
    """Append a "Title / Url" record followed by an empty line.

    Args:
        title: Title to record.
        dstUrl: URL that belongs to the title.
        TitleAndUrlTxtName: Path of the title/URL file.
    """
    print("正在向{0}中添加\n{1} : {2}".format(TitleAndUrlTxtName, title, dstUrl))
    with open(TitleAndUrlTxtName, mode="a", encoding="utf-8") as f:
        f.write("Title      :       {0}".format(title))
        f.write("\n")
        f.write("Url        :       {0}".format(dstUrl))
        f.write("\n")
        f.write("\n")


def checkTxtExist(dirName, titleTxtName, titleAndUrlTxtName):
    """Create the two record files inside ``dirName`` when they are missing.

    Missing files are created in the same directory that is checked, so the
    paths main() later opens under ``dirName`` exist on a first run.

    Args:
        dirName: Existing directory that holds the record files.
        titleTxtName: File name of the title file, relative to ``dirName``.
        titleAndUrlTxtName: File name of the title/URL file, relative to
            ``dirName``.
    """
    for name in (titleTxtName, titleAndUrlTxtName):
        path = os.path.join(dirName, name)
        if not os.path.exists(path):
            print("没有{0}文件,正在创建~".format(name))
            with open(path, "w", encoding="utf-8"):
                pass


def checkDir(dirName):
    """Create the record directory when it does not exist yet.

    Args:
        dirName: Directory path; an existing directory is left untouched.
    """
    os.makedirs(dirName, exist_ok=True)


def main():
    """Download every index page and record the titles not stored yet."""
    checkDir(dirName=dirName)
    checkTxtExist(titleTxtName=titleTxtName, titleAndUrlTxtName=titleAndUrlTxtName, dirName=dirName)
    # Keep the joined paths local: rebinding the module-level names would
    # prefix the directory again on every further call of main().
    titlePath = dirName + "/" + titleTxtName
    titleAndUrlPath = dirName + "/" + titleAndUrlTxtName
    catFlag = 0
    for rootUrl in getRootUrls(printFlag=False):
        links = getLinksPerRootUrl(rootUrl=rootUrl, printFlag=False)
        titles = urlTitles(rootUrl=rootUrl)
        dstUrls = catUrl(catFlag=catFlag, signalLinks=links)
        # Pair by position; zip stops at the shorter list, so a page with
        # unequal numbers of titles and links cannot raise IndexError.
        for title, dstUrl in zip(titles, dstUrls):
            print("-" * 50)
            if checkTitleExist(dstTitle=title, checkTitleTxtName=titlePath) is False:
                appendTitle(dstTitle=title, appendTitleTxtName=titlePath)
                appendTitleAndUrl(TitleAndUrlTxtName=titleAndUrlPath, title=title, dstUrl=dstUrl)


if __name__ == '__main__':
    main()

《麻辣烫的标题和对应的网址文件化》相关推荐

  1. 根据文献标题免费下载PDF格式文件的文献内容

    根据文献标题免费下载PDF格式文件的文献内容 通过文献名下载文献:较为通用,如有不能下载情况请留言. 第一步:根据文献名,查出文献的DOI. 进入https://www.crossref.org/ , ...

  2. dfmea文件_PFMEA执行之步骤七:结果文件化 vs 高层管理者的承诺

    0.本章大纲 概览 PFMEA执行报告 高层管理者的承诺 vs 持续改进 PFMEA执行步骤七之输出物 常见不符合项 1.概览 结果文件化步骤的目的是,针对PFMEA活动的结果进行总结和交流 将过程结 ...

  3. wp博客链接.html,WordPress博客文章标题链接到自定义网址链接

    WordPress 支持多种文章形式,相信不少博主遇到过需要把wordpress文章的标题链接到自定义的URL地址的情况,点击文章链接时不是直接访问文章详情页面,而是直接访问分享的链接网页,就像微博一 ...

  4. 百度搜索技巧,精确搜索,搜索指定标题、内容、网址,黑语法搜索入门

    经常用百度的话,有时有想搜的内容但是却不能搜到,搜出一堆虽然有关键词但是根本不是你想要的内容. 因为百度有他的算法,大多数时候他能给你最想要的结果,但是有的时候,你想搜一些比较偏的东西,那就怎么也不行 ...

  5. 【无标题】导出为Excel文件

    如何将layui表格的数据导出为Excel文件? 下图为视图代码 控制器代码如下 public ActionResult ExportExcel() { //查询信息 var list = (from ...

  6. 【无标题】AD导入CAD文件发现找不到图形

    AD导入CAD文件发现找不到图形,位置偏的厉害,这其实是软件bug,按下面方法操作: 1.将CAD文件放大39.37倍,精度越高越好,因为1mm=39.37007874mil;保存文件. 2.打开AD ...

  7. 【无标题】Linux查看.del文件内容

    view + .del文件 退出查看: :q!

  8. Windows取证——登录过的用户名、新建的用户名和访问的网址文件(墨者学院)

    目录 前言知识点: 一.登录过的用户名 二.新建的用户名 法一: 法二:

  9. Qt 窗口的一些简单设置-标题、图标、最大化最小化按钮、任务栏图标

    一些很零散的窗体控制方法,在这总结一些. 1.更改窗体标题 this->setWindowTitle("窗体标题"); 窗体标题"就是更改的窗体标题 2.控制窗体大 ...

最新文章

  1. 超级详细的 Python 数据分析指南
  2. TypeScript 3.0下react默认属性DefaultProps解决方案
  3. vs2022 qt环境搭建调试
  4. 开发linux显卡驱动,显卡驱动开发DRM入门--Apple的学习笔记
  5. Razor视图引擎浅析
  6. 2020蓝桥杯省赛---java---B---6(分类计数)
  7. 安装inde.html使用babel,reactjs – 使用Babel Standalone进行单个React组件渲染,仅使用index.html和Component...
  8. 云计算的三种服务模式(SaaS、PasS、IaaS)介绍
  9. 【NLP】毕设学习笔记(八)“前馈 + 反馈” = 循环神经网络RNN
  10. PyCharm 下提示 'no module named time'
  11. Python 从入门到进阶
  12. 小程序Table样式
  13. origin 快捷键
  14. MTSP遗传算法解决
  15. 模拟购物车系统(添加、修改、查询、结算)(Java实现)
  16. win10连接filco蓝牙键盘
  17. Gvim中实现特定行文本的替换
  18. 「硬核讲解」通达信跨周期引用均线指标公式
  19. 为什么选php语言做网站,php做网站教程:PHP语言怎么做网站
  20. 把指针当作动态数组使用

热门文章

  1. Centos7用户,组及文件权限管理
  2. 如何设计RESTful风格API
  3. JumpServer堡垒机
  4. sql min函数_SQL min()和max()函数
  5. ipad iphone开发_如何解锁iPhone或iPad
  6. 国内第一国际第二!首都机场年旅客吞吐量破亿
  7. 2021-湖湘杯final-Web
  8. 酞菁铅(PbPc),CAS:15187-16-3,PHTHALOCYANINELEAD齐岳定制酞菁材料
  9. 阴阳师服务器维护 2月11,阴阳师2月11日体验服维护详情 2月11日何时开服?
  10. SQL中DATEPART的应用