导入模块:
re:(Regular Expression 正则表达式),提取满足要求的信息(字符串)
requests:发送网络请求,返回响应数据(连接到对方网站)
traceback:打印详细的异常堆栈信息(本脚本用 traceback.print_exc() 输出出错位置,而不是抛出异常)
BeautifulSoup:Python解析html或xml的库
code:
import requests
from bs4 import BeautifulSoup
import traceback
import re
def getHTMLText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to request.

    Returns:
        The decoded response text, or an empty string when the request
        fails (connection problem, timeout, or an HTTP error status).
    """
    try:
        # timeout=30: give up after 30 seconds instead of hanging forever.
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx status codes into an exception (e.g. 404).
        r.raise_for_status()
        # Decode with the encoding guessed from the body rather than the
        # one announced in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers treat "" as "page unavailable". Only
        # request-level failures are swallowed, so Ctrl-C still works.
        return ""
def getStockList(lst, stockURL):
    """Append the flight numbers found on a listing page to *lst*.

    Every ``<a>`` tag of the page at *stockURL* is inspected; when its
    ``title`` attribute consists solely of ASCII letters and digits, that
    title is taken to be a flight number and appended to *lst*.

    Args:
        lst: List that is extended in place with the flight numbers.
        stockURL: URL of the listing page to scan.

    Returns:
        None. Results are delivered through *lst*.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # Compile once; the same pattern is tested against every link title.
    flight_no = re.compile(r"^[A-Za-z0-9]+$")
    for link in soup.find_all('a'):
        title = link.attrs.get("title")
        if not isinstance(title, str):
            # No title attribute (or not plain text): not a flight link.
            continue
        match = flight_no.match(title)
        if match:
            lst.append(match.group(0))
def getStockInfo(lst, stockURL, fpath):
    """Fetch the detail page of each flight and append a record to *fpath*.

    For every flight number in *lst* the page ``stockURL + number + ".html"``
    is downloaded. From its ``div.detail-info`` element the function reads
    the flight number, the text of every element with class ``time`` and the
    gate value, prints the resulting list and appends it as one line to
    *fpath*.

    Args:
        lst: Iterable of flight-number strings.
        stockURL: URL prefix of the per-flight detail page.
        fpath: Path of the UTF-8 text file that records are appended to.

    Returns:
        None. Flights whose page cannot be downloaded are skipped; flights
        whose page cannot be parsed or written are reported via a printed
        traceback and then skipped.
    """
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        if html == "":
            # Download failed; nothing to parse for this flight.
            continue
        try:
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'detail-info'})
            # The second "ml5" element holds the flight number text.
            name = stockInfo.find_all(attrs={'class': 'ml5'})[1]
            times = stockInfo.find_all(attrs={'class': 'time'})
            # The second "strong" element holds the gate.
            gate = stockInfo.find_all(attrs={'class': 'strong'})[1]

            info = ['航班号' + ":" + name.text.split()[0]]
            info.extend(entry.text for entry in times)
            # The gate text starts with a newline; keep characters 1..3.
            # NOTE(review): the original indentation was lost; the gate is
            # assumed to be appended once, after the times - confirm.
            info.append(str(gate.text)[1:4])

            print(info)
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(info) + '\n')
        except Exception:
            # Unexpected page layout (missing div/elements) or a write
            # error: report it and carry on with the next flight.
            traceback.print_exc()
            continue
def main():
    """Crawl the departure listing pages and record every flight's details.

    The base listing page is processed first, then the numbered pages
    ``.p1`` through ``.p45``. For each page the flight numbers are collected
    with ``getStockList`` and their details are appended to the output file
    by ``getStockInfo``.
    """
    depth = 46  # exclusive upper bound of the page numbers to visit
    stock_list_url = 'http://flights.ctrip.com/actualtime/depart-ctu/'
    # NOTE(review): "--" restored from an en dash produced by the same
    # typographic mangling that affected the quotes - confirm the URL.
    stock_info_url = 'http://flights.ctrip.com/actualtime/fno--'
    output_file = 'D://11.txt'

    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

    for i in range(1, depth):
        try:
            print("第" + str(i) + "页")
            slist = []
            # URL of the i-th listing page.
            stock_list_url = ('http://flights.ctrip.com/actualtime/depart-ctu'
                              + '.p' + str(i) + '/')
            getStockList(slist, stock_list_url)
            getStockInfo(slist, stock_info_url, output_file)
        except Exception:
            # Keep crawling the remaining pages, but do not hide the failure.
            traceback.print_exc()
            continue
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
截图: