博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
雪球数据的定时爬取
阅读量:4916 次
发布时间:2019-06-11

本文共 4955 字,大约阅读时间需要 16 分钟。

import requests
from lxml import etree
import json
import pymongo

# MongoDB connection: scraped articles go into database "xueqiu",
# collection "xueqiu", one document per article ({"title": ..., "content": ...}).
client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.xueqiu
collection = db.xueqiu

url = 'https://xueqiu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
# A Session is used so cookies set by the home page are reused by the
# timeline API call below (xueqiu rejects cookie-less API requests).
session = requests.Session()
session.get(url=url, headers=headers)


def get_page_list():
    """Fetch one page (10 items) of the public timeline and scrape each article.

    Calls parse_detail() for every article URL found; returns None.
    """
    url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
    params = {
        "since_id": "-1",
        "max_id": "-1",
        "count": "10",
        "category": "-1",
    }
    response = session.get(url=url, headers=headers, params=params)
    page_text = response.json()
    content = page_text["list"]
    for item in content:
        # Each entry's "data" field is itself a JSON-encoded string.
        per_info = json.loads(item["data"])
        target = per_info["target"]
        detail_url = "https://xueqiu.com" + target
        parse_detail(detail_url)
        # break


def parse_detail(url):
    """Scrape title and body text from an article page and store them in MongoDB.

    :param url: absolute URL of an article detail page.
    """
    response = session.get(url=url, headers=headers)
    page_text = response.text
    tree = etree.HTML(page_text)
    # xpath() returns a list of text nodes; join into a single string
    # (bug fix: the original stored the raw list as the title).
    title_parts = tree.xpath('//div[@class="container article__container"]//h1[@class="article__bd__title"]/text()')
    title = "".join(title_parts).strip()
    print(title)
    print("==" * 20)
    p_list = tree.xpath('//div[@class="article__bd__detail"]/p')
    content_list = []
    for p in p_list:
        # Collect direct text plus bold-tag text, skip empty paragraphs.
        text = "".join(p.xpath('./text()|./b/text()')).strip()
        if text:
            content_list.append(text)
    data_dict = {"title": title, "content": "".join(content_list)}
    # insert_one replaces the deprecated Collection.insert
    # (deprecated in pymongo 3.x, removed in 4.x).
    collection.insert_one(data_dict)


def main():
    get_page_list()


if __name__ == '__main__':
    main()

 

优化成redis增量式获取数据

import requests
from lxml import etree
import json
from redis import Redis
import pymongo
import time
import datetime

# MongoDB stores the scraped articles; the Redis set "news_urls" records
# URLs already seen, giving incremental (de-duplicated) crawling.
client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.xueqiu
collection = db.xueqiu
conn = Redis(host='127.0.0.1', port=6379)

url = 'https://xueqiu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
# A Session is used so cookies set by the home page are reused by the
# timeline API call below (xueqiu rejects cookie-less API requests).
session = requests.Session()
session.get(url=url, headers=headers)


def get_page_list():
    """Fetch one page of the public timeline and scrape only unseen articles.

    De-duplication: SADD returns 0 when the URL is already in the Redis set,
    meaning it was scraped on a previous run and is skipped.
    """
    url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
    params = {
        "since_id": "-1",
        "max_id": "-1",
        "count": "10",  # number of items to fetch
        "category": "-1",
    }
    response = session.get(url=url, headers=headers, params=params)
    page_text = response.json()
    content = page_text["list"]
    for item in content:
        # Each entry's "data" field is itself a JSON-encoded string.
        per_info = json.loads(item["data"])
        target = per_info["target"]
        detail_url = "https://xueqiu.com" + target
        # URL de-duplication via Redis set membership.
        ex = conn.sadd('news_urls', detail_url)
        if ex == 0:
            print('暂无最新数据可爬取......')
        else:
            print('有最新数据的更新......')
            parse_detail(detail_url)
        # break


def parse_detail(url):
    """Scrape title and body text from an article page and store them in MongoDB.

    :param url: absolute URL of an article detail page.
    """
    response = session.get(url=url, headers=headers)
    page_text = response.text
    tree = etree.HTML(page_text)
    # xpath() returns a list of text nodes; join into a single string
    # (bug fix: the original stored the raw list as the title).
    title_parts = tree.xpath('//div[@class="container article__container"]//h1[@class="article__bd__title"]/text()')
    title = "".join(title_parts).strip()
    print(title)
    print("==" * 20)
    p_list = tree.xpath('//div[@class="article__bd__detail"]/p')
    content_list = []
    for p in p_list:
        # Collect direct text plus bold-tag text, skip empty paragraphs.
        text = "".join(p.xpath('./text()|./b/text()')).strip()
        if text:
            content_list.append(text)
    data_dict = {"title": title, "content": "".join(content_list)}
    # insert_one replaces the deprecated Collection.insert
    # (deprecated in pymongo 3.x, removed in 4.x).
    collection.insert_one(data_dict)


def main():
    """Run the crawler on a simple schedule.

    The loop waits until `sched_time` has passed, then sleeps 300 s before
    each crawl; after a crawl, the next scheduled time is pushed forward
    by one minute.
    """
    flag = 0
    now = datetime.datetime.now()
    # First scheduled run: 6 seconds from startup (seconds precision only).
    sched_time = datetime.datetime(
        now.year, now.month, now.day, now.hour, now.minute, now.second
    ) + datetime.timedelta(seconds=6)
    while True:
        now = datetime.datetime.now()
        # Crawl interval control: sleep 300 s between crawls.
        if sched_time < now:
            time.sleep(300)
            print(now)
            get_page_list()
            flag = 1
        else:
            if flag == 1:
                sched_time = sched_time + datetime.timedelta(minutes=1)
                flag = 0


if __name__ == '__main__':
    main()

 

转载于:https://www.cnblogs.com/kenD/p/11123726.html

你可能感兴趣的文章
python解决上楼梯问题
查看>>
变参宏 __VA_ARGS__
查看>>
sql 语句
查看>>
VUE一 基础语法
查看>>
[MySQl]MySQL忘记密码
查看>>
Android的minSdkVersion,targetSdkVersion,maxSdkVersion
查看>>
Xceed WinForm数据表格控件Xceed Grid For .NET控件详细介绍及下载地址
查看>>
ecos启动流程分析
查看>>
Oracle CASE WHEN 用法介绍
查看>>
linux 下连接mysql服务器
查看>>
DOMContentLoad 首屏渲染
查看>>
rpm检验是否被改动过
查看>>
Sphinx-简介及原理
查看>>
【Linux】深入理解Linux中内存管理
查看>>
WEB 移动网站 手机点击 打电话 发短信
查看>>
2019CSUST集训队选拔赛题解(一)
查看>>
李晓菁201771010114《面向对象程序设计(Java)》第三周学习总结
查看>>
Typedef与Struct
查看>>
Linux常用网络命令整理
查看>>
JMeter学习笔记--使用URL回写来处理用户会话
查看>>