导入模块:

re:(Regular Expression 正则表达式),提取满足要求的信息(字符串)

requests:发送网络请求,返回响应数据(连接到对方网站)

traceback:打印详细的异常堆栈信息(本文使用 traceback.print_exc())

BeautifulSoup:Python解析html或xml的库

code:

import requests

from bs4 import BeautifulSoup

import traceback

import re

def getHTMLText(url):
    """Download *url* and return the decoded response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or an empty string when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    try:
        # Wait at most 30 seconds for the server before giving up.
        r = requests.get(url, timeout=30)
        # Convert HTTP error statuses (e.g. 404) into an exception.
        r.raise_for_status()
        # Decode with the encoding detected from the content instead of
        # the one taken from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers treat "" as "page unavailable".
        return ""

def getStockList(lst, stockURL):
    """Append the flight numbers found on a listing page to *lst*.

    Every ``<a>`` tag on the page is inspected; when its ``title``
    attribute consists solely of ASCII letters and digits, that title is
    taken to be a flight number.

    Args:
        lst: List that receives the flight numbers (modified in place).
        stockURL: URL of the listing page to scan.

    Returns:
        None. Results are delivered through *lst*.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # Compiled once: the same pattern is applied to every link.
    flight_no_pattern = re.compile(r"^[A-Za-z0-9]+$")
    for anchor in soup.find_all('a'):
        title = anchor.get("title")
        if not isinstance(title, str):
            # Link without a usable title attribute: nothing to extract.
            continue
        found = flight_no_pattern.match(title)
        if found:
            lst.append(found.group(0))

def getStockInfo(lst, stockURL, fpath):
    """Fetch the detail page of every flight in *lst* and record it.

    For each flight number the page ``stockURL + number + ".html"`` is
    downloaded and parsed. The extracted record is printed to the console
    and appended to *fpath* as one line.

    Args:
        lst: Flight numbers to look up.
        stockURL: URL prefix of the per-flight detail pages.
        fpath: Path of the UTF-8 text file that records are appended to.

    Returns:
        None.

    Pages that cannot be downloaded are skipped silently. Pages that do
    not have the expected structure are skipped after their traceback is
    printed.
    """
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        if html == "":
            # Download failed; nothing to parse for this flight.
            continue
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # All fields are read from the div with class "detail-info".
            stockInfo = soup.find('div', attrs={'class': 'detail-info'})
            # The second "ml5" element carries the flight number
            # (per the original author's note).
            name = stockInfo.find_all(attrs={'class': 'ml5'})[1]
            info = ['航班号' + ":" + name.text.split()[0]]
            # "time" elements: departure and arrival times
            # (per the original author's note).
            time_tags = stockInfo.find_all(attrs={'class': 'time'})
            # The second "strong" element holds the boarding gate. Its text
            # begins with a newline, so keep only the three characters
            # that follow it (per the original author's note).
            gate_tag = stockInfo.find_all(attrs={'class': 'strong'})[1]
            gate = str(gate_tag.text)[1:4]
            for time_tag in time_tags:
                # Each time entry is followed by the same gate string.
                info.append(time_tag.text)
                info.append(gate)
            print(info)
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(info) + '\n')
        except Exception:
            # A page with an unexpected layout must not stop the crawl;
            # report it and move on to the next flight.
            traceback.print_exc()

def main():
    """Crawl the departure listing pages and save every flight's details.

    The unpaginated listing page is processed first, then the paginated
    pages ``.p1`` through ``.p45``. Each page's flight numbers are
    collected with getStockList and their details written to the output
    file by getStockInfo.
    """
    depth = 46  # Exclusive upper bound for the page counter below.
    stock_list_url = 'http://flights.ctrip.com/actualtime/depart-ctu/'
    # NOTE(review): the published text shows an en dash after "fno";
    # restored here as "--" on the assumption that it is typographic
    # damage like the curly quotes. Confirm against the live URL scheme.
    stock_info_url = 'http://flights.ctrip.com/actualtime/fno--'
    output_file = 'D://11.txt'

    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

    for i in range(1, depth):
        try:
            print("第" + str(i) + "页")
            # Start from an empty list so each page is processed on its own.
            slist = []
            stock_list_url = 'http://flights.ctrip.com/actualtime/depart-ctu' + '.p' + str(i) + '/'
            getStockList(slist, stock_list_url)
            getStockInfo(slist, stock_info_url, output_file)
        except Exception:
            # Keep crawling the remaining pages, but report what went wrong
            # instead of hiding it.
            traceback.print_exc()


if __name__ == "__main__":
    main()

截图:




版权声明:本文为weixin_36699843原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
原文链接:https://blog.csdn.net/weixin_36699843/article/details/82314553