1、获取新浪财经实时股票数据
# =============================================================================
# 9.1 新浪股票实时数据挖掘实战 by 王宇韬
# =============================================================================
from selenium import webdriver
import re
# Fetch the real-time Shanghai Composite index price from Sina Finance.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
# Selenium 4 removed the `chrome_options` keyword; `options` is the supported name.
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get('http://finance.sina.com.cn/realstock/company/sh000001/nc.shtml')
    data = browser.page_source  # rendered HTML after JavaScript has populated the price
    # print(data)
finally:
    browser.quit()  # release the browser process even if the page load fails
# Regex to extract the stock price (raw string is the regex convention,
# byte-identical here since the pattern has no backslashes).
p_price = r'<div id="price" class=".*?">(.*?)</div>'
price = re.findall(p_price, data)
print(price)
2、东方财富网数据挖掘实战
# =============================================================================
# 9.2 东方财富网数据挖掘实战 by 王宇韬
# =============================================================================
from selenium import webdriver
import re
def dongfang(company):
    """Scrape East Money (eastmoney.com) news search results for *company*.

    Prints one numbered "title - date" line plus the article URL for each
    news item found on the first search-results page. Returns None.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # no visible browser window
    # Selenium 4 removed the `chrome_options` keyword; `options` is the supported name.
    browser = webdriver.Chrome(options=chrome_options)
    url = 'http://so.eastmoney.com/news/s?keyword=' + company
    try:
        browser.get(url)
        data = browser.page_source  # rendered HTML of the search-results page
        # print(data)
    finally:
        browser.quit()  # always release the browser, even if the load fails
    # Regexes for title, link and date of each news item.
    p_title = '<div class="news-item"><h3><a href=".*?">(.*?)</a>'
    p_href = '<div class="news-item"><h3><a href="(.*?)">.*?</a>'
    p_date = '<p class="news-desc">(.*?)</p>'
    title = re.findall(p_title, data)
    href = re.findall(p_href, data)
    date = re.findall(p_date, data, re.S)  # re.S: the date block may span newlines
    # Guard against mismatched list lengths (a missing date/link would
    # otherwise raise IndexError mid-loop).
    for i in range(min(len(title), len(href), len(date))):
        title[i] = re.sub('<.*?>', '', title[i])  # strip keyword-highlight tags like <em>
        date[i] = date[i].split(' ')[0]  # keep the date, drop the time part
        # Combined, human-readable output line.
        print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
        print(href[i])
# Batch-scrape East Money news for each company; one failure must not
# abort the rest of the batch.
companys = ['华能信托', '阿里巴巴', '腾讯', '京东', '万科']
for company in companys:
    try:
        dongfang(company)
        print(company + '该公司东方财富网爬取成功')
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work;
    # any scraping error is reported and the loop continues.
    except Exception:
        print(company + '该公司东方财富网爬取失败')
3、裁判文书网:自动在网页上搜索
# =============================================================================
# 9.3 裁判文书网数据挖掘实战 by 王宇韬
# =============================================================================
from selenium import webdriver
import time
# China Judgements Online: automate a keyword search in the live page.
# `find_element_by_xpath` was removed in Selenium 4; use find_element(By.XPATH, ...).
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    browser.get('http://wenshu.court.gov.cn/')
    browser.maximize_window()
    # Locate the search box once instead of re-querying the same XPath.
    search_box = browser.find_element(By.XPATH, '//*[@id="_view_1540966814000"]/div/div/div[2]/input')
    search_box.clear()  # clear any pre-filled text in the search box
    search_box.send_keys('房地产')  # type the search keyword
    browser.find_element(By.XPATH, '//*[@id="_view_1540966814000"]/div/div/div[3]').click()  # click the search button
    time.sleep(10)  # crude wait for results to render; lengthen if content is missing
    data = browser.page_source
finally:
    browser.quit()  # release the browser even if any step above fails
print(data)
4、巨潮资讯网:多个指定关键词的公告信息批量爬取
# =============================================================================
# 9.4 巨潮资讯网数据挖掘实战 by 王宇韬
# =============================================================================
from selenium import webdriver
import re
def juchao(keyword):
    """Scrape CNINFO (cninfo.com.cn) full-text search results for *keyword*.

    Prints one numbered "title - date" line plus the absolute announcement
    URL for each result on the first page. Returns None.
    """
    browser = webdriver.Chrome()
    url = 'http://www.cninfo.com.cn/new/fulltextSearch?notautosubmit=&keyWord=' + keyword
    try:
        browser.get(url)
        data = browser.page_source  # rendered HTML of the search-results page
        # print(data)
    finally:
        browser.quit()  # always release the browser, even if the load fails
    # Regexes for title, relative link and date of each announcement row.
    p_title = '<td class="sub-title"><a href=".*?" target="_blank">(.*?)</td>'
    p_href = '<td class="sub-title"><a href="(.*?)" target="_blank">.*?</td>'
    p_date = '<div class="sub-time-time">(.*?)</div>'
    title = re.findall(p_title, data)
    href = re.findall(p_href, data)
    date = re.findall(p_date, data)
    # Guard against mismatched list lengths to avoid an IndexError mid-loop.
    for i in range(min(len(title), len(href), len(date))):
        title[i] = re.sub(r'<.*?>', '', title[i])  # strip keyword-highlight tags
        href[i] = 'http://www.cninfo.com.cn' + href[i]  # make the link absolute
        # Unescape the HTML entity in query strings. The original stripped the
        # bare substring 'amp;', which would also corrupt any literal 'amp;';
        # replacing the full entity '&amp;' with '&' is the intended fix.
        href[i] = href[i].replace('&amp;', '&')
        date[i] = date[i].split(' ')[0]  # keep the date, drop the time part
        print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
        print(href[i])
# Batch-scrape CNINFO announcements for each keyword.
keywords = ['理财', '现金管理', '纾困']
for keyword in keywords:
    juchao(keyword)
版权声明:本文为sinat_27184083原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。