1. Scraping real-time stock data from Sina Finance (新浪财经)

# =============================================================================
# 9.1 Sina real-time stock data mining in practice, by 王宇韬
# =============================================================================

from selenium import webdriver
import re
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://finance.sina.com.cn/realstock/company/sh000001/nc.shtml')
data = browser.page_source
# print(data)
browser.quit()
# Regular expression for extracting the stock price
p_price = '<div id="price" class=".*?">(.*?)</div>'
price = re.findall(p_price, data)
print(price)
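
The quote on this page is rendered by JavaScript, so page_source read immediately after get() may not yet contain the price. Below is a minimal sketch of a more patient variant, assuming Selenium's WebDriverWait and the id="price" element that the regex above already relies on; converting the matched text to a number is an extra step not in the original:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://finance.sina.com.cn/realstock/company/sh000001/nc.shtml')
# Wait up to 10 seconds for the price element to appear instead of reading page_source right away
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, 'price')))
price_text = browser.find_element(By.ID, 'price').text
browser.quit()
print(float(price_text.replace(',', '')))  # strip thousands separators, if any, before converting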

2. Data mining on Eastmoney (东方财富网)

# =============================================================================
# 9.2 Eastmoney data mining in practice, by 王宇韬
# =============================================================================

from selenium import webdriver
import re


def dongfang(company):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    url = 'http://so.eastmoney.com/news/s?keyword=' + company
    browser.get(url)
    # Get the rendered page source and store it in data
    data = browser.page_source
    browser.quit()
    # print(data)

    p_title = '<div class="news-item"><h3><a href=".*?">(.*?)</a>'
    p_href = '<div class="news-item"><h3><a href="(.*?)">.*?</a>'
    p_date = '<p class="news-desc">(.*?)</p>'
    # Extract the titles, links, and dates from data
    title = re.findall(p_title, data)
    href = re.findall(p_href, data)
    date = re.findall(p_date, data, re.S)  # re.S lets . match across line breaks

    for i in range(len(title)):
        title[i] = re.sub('<.*?>', '', title[i])  # strip any HTML tags left inside the title
        date[i] = date[i].split(' ')[0]  # keep only the date, drop the time
        # Print the combined result
        print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
        print(href[i])


companys = ['华能信托', '阿里巴巴', '腾讯', '京东', '万科']
for i in companys:
    try:
        dongfang(i)
        print(i + ': scraped from Eastmoney successfully')
    except:
        print(i + ': Eastmoney scrape failed')
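
The bare except above hides why a company failed. A small variant of the calling loop, assuming nothing beyond the dongfang() function defined above, reports the actual exception instead of swallowing it:

for i in companys:
    try:
        dongfang(i)
        print(i + ': scraped from Eastmoney successfully')
    except Exception as e:  # report the error rather than hiding it behind a bare except
        print(i + ': Eastmoney scrape failed - ' + str(e))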


3. China Judgments Online (裁判文书网): automating a search on the site

# =============================================================================
# 9.3 China Judgments Online data mining in practice, by 王宇韬
# =============================================================================

from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get('http://wenshu.court.gov.cn/')
browser.maximize_window()

browser.find_element_by_xpath('//*[@id="_view_1540966814000"]/div/div/div[2]/input').clear()  # Clear the search box
browser.find_element_by_xpath('//*[@id="_view_1540966814000"]/div/div/div[2]/input').send_keys('房地产')  # Type the keyword '房地产' (real estate) into the search box
browser.find_element_by_xpath('//*[@id="_view_1540966814000"]/div/div/div[3]').click()  # Click the search button
time.sleep(10)  # If the content you want still fails to load, lengthen this wait a little
data = browser.page_source
browser.quit()
print(data)
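
print(data) dumps a very long HTML string to the console. A minimal alternative is to save the rendered page to a local file and inspect it in an editor or browser before writing extraction regexes; the file name wenshu_search.html is an arbitrary choice:

with open('wenshu_search.html', 'w', encoding='utf-8') as f:
    f.write(data)  # save the rendered search-result page for offline inspection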

4. Cninfo (巨潮资讯网): batch scraping announcements for multiple keywords

# =============================================================================
# 9.4 Cninfo data mining in practice, by 王宇韬
# =============================================================================

from selenium import webdriver
import re

def juchao(keyword):
    browser = webdriver.Chrome()
    url = 'http://www.cninfo.com.cn/new/fulltextSearch?notautosubmit=&keyWord=' + keyword
    browser.get(url)
    data = browser.page_source
    # print(data)
    browser.quit()

    p_title = '<td class="sub-title"><a href=".*?" target="_blank">(.*?)</td>'
    p_href = '<td class="sub-title"><a href="(.*?)" target="_blank">.*?</td>'
    p_date = '<div class="sub-time-time">(.*?)</div>'
    title = re.findall(p_title, data)
    href = re.findall(p_href, data)
    date = re.findall(p_date, data)

    for i in range(len(title)):
        title[i] = re.sub(r'<.*?>', '', title[i])  # strip any HTML tags left inside the title
        href[i] = 'http://www.cninfo.com.cn' + href[i]  # turn the relative link into an absolute URL
        href[i] = re.sub('amp;', '', href[i])  # turn the &amp; entity back into &, so the link works
        date[i] = date[i].split(' ')[0]  # keep only the date, drop the time
        print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
        print(href[i])

keywords = ['理财', '现金管理', '纾困']
for i in keywords:
    juchao(i)
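
The line re.sub('amp;', '', href[i]) in juchao() is a workaround for &amp; entities left in the extracted links. A more general alternative uses only the standard library's html.unescape, which decodes all HTML entities at once; the example link below is made up purely for illustration:

import html

href_raw = '/new/disclosure/detail?plate=&amp;orgId=12345'  # hypothetical relative link containing &amp;
href_fixed = 'http://www.cninfo.com.cn' + html.unescape(href_raw)  # &amp; is decoded back to &
print(href_fixed)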

 


Copyright notice: this is an original article by sinat_27184083, released under the CC 4.0 BY-SA license. Please include a link to the original article and this notice when reposting.
Original link: https://blog.csdn.net/sinat_27184083/article/details/103349457