# Crawl the Xueqiu public timeline and store every listed article in MongoDB.
import json

import pymongo
import requests
from lxml import etree

# MongoDB connection; parsed articles are written to the xueqiu.xueqiu collection.
client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.xueqiu
collection = db.xueqiu

url = 'https://xueqiu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the crawler forever.
REQUEST_TIMEOUT = 10

session = requests.Session()
# Visit the home page first so later requests reuse this session's state --
# presumably the JSON API needs the cookies set here; TODO confirm.
session.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)


def get_page_list():
    """Fetch the public timeline feed and crawl every article it lists.

    Requests the ``public_timeline_by_category`` JSON endpoint, decodes the
    JSON string stored in each entry's ``data`` field, builds the absolute
    article URL from its ``target`` path and hands it to ``parse_detail``.
    """
    # The query string lives only in ``params``; also embedding it in the URL
    # would make requests send every parameter twice.
    api_url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
    params = {
        "since_id": "-1",
        "max_id": "-1",
        "count": "10",
        "category": "-1",
    }
    response = session.get(
        url=api_url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
    )
    for entry in response.json()["list"]:
        # ``data`` is itself a JSON-encoded string, not a nested object.
        per_info = json.loads(entry["data"])
        parse_detail("https://xueqiu.com" + per_info["target"])


def parse_detail(url):
    """Download one article page and store its title and text in MongoDB.

    Args:
        url: Absolute URL of the article detail page.
    """
    response = session.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    tree = etree.HTML(response.text)

    # xpath() returns a list of text nodes; the list is stored unchanged.
    title = tree.xpath('//div[@class="container article__container"]//h1[@class="article__bd__title"]/text()')
    print(title)
    print("==" * 20)

    paragraphs = []
    for p in tree.xpath('//div[@class="article__bd__detail"]/p'):
        # Take the paragraph's own text plus any bold runs directly inside it.
        text = "".join(p.xpath('./text()|./b/text()')).strip()
        if text:
            paragraphs.append(text)

    # insert_one replaces Collection.insert, which was removed in PyMongo 4.
    collection.insert_one({"title": title, "content": "".join(paragraphs)})


def main():
    """Run a single crawl of the timeline."""
    get_page_list()


if __name__ == '__main__':
    main()
优化版：利用 Redis 集合对详情页 URL 去重，实现增量式数据抓取
# Incremental Xueqiu crawler: polls the public timeline, uses a Redis set to
# skip article URLs that were already crawled, and stores new ones in MongoDB.
import datetime
import json
import time

import pymongo
import requests
from lxml import etree
from redis import Redis

# MongoDB connection; parsed articles are written to the xueqiu.xueqiu collection.
client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.xueqiu
collection = db.xueqiu

# Redis connection; holds the set of article URLs that were already crawled.
conn = Redis(host='127.0.0.1', port=6379)

url = 'https://xueqiu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the poller forever.
REQUEST_TIMEOUT = 10

# Name of the Redis set used for URL de-duplication.
SEEN_URLS_KEY = 'news_urls'

session = requests.Session()
# Visit the home page first so later requests reuse this session's state --
# presumably the JSON API needs the cookies set here; TODO confirm.
session.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)


def get_page_list():
    """Fetch the public timeline feed and crawl the articles not seen before.

    Requests the ``public_timeline_by_category`` JSON endpoint, decodes the
    JSON string stored in each entry's ``data`` field and builds the absolute
    article URL from its ``target`` path. A URL is crawled only when adding it
    to the Redis set reports it as new.

    Raises:
        Exception: whatever ``parse_detail`` raises; the URL is removed from
            the Redis set first so a later poll can retry it.
    """
    # The query string lives only in ``params``; also embedding it in the URL
    # would make requests send every parameter twice.
    api_url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
    params = {
        "since_id": "-1",
        "max_id": "-1",
        "count": "10",  # fetch 10 entries per poll
        "category": "-1",
    }
    response = session.get(
        url=api_url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
    )
    for entry in response.json()["list"]:
        # ``data`` is itself a JSON-encoded string, not a nested object.
        per_info = json.loads(entry["data"])
        detail_url = "https://xueqiu.com" + per_info["target"]

        # SADD returns 0 when the member already exists, i.e. already crawled.
        if conn.sadd(SEEN_URLS_KEY, detail_url) == 0:
            print('暂无最新数据可爬取......')
            continue

        print('有最新数据的更新......')
        try:
            parse_detail(detail_url)
        except Exception:
            # Un-mark the URL so a failed fetch/insert is retried on a later
            # poll instead of being skipped forever.
            conn.srem(SEEN_URLS_KEY, detail_url)
            raise


def parse_detail(url):
    """Download one article page and store its title and text in MongoDB.

    Args:
        url: Absolute URL of the article detail page.
    """
    response = session.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    tree = etree.HTML(response.text)

    # xpath() returns a list of text nodes; the list is stored unchanged.
    title = tree.xpath('//div[@class="container article__container"]//h1[@class="article__bd__title"]/text()')
    print(title)
    print("==" * 20)

    paragraphs = []
    for p in tree.xpath('//div[@class="article__bd__detail"]/p'):
        # Take the paragraph's own text plus any bold runs directly inside it.
        text = "".join(p.xpath('./text()|./b/text()')).strip()
        if text:
            paragraphs.append(text)

    # insert_one replaces Collection.insert, which was removed in PyMongo 4.
    collection.insert_one({"title": title, "content": "".join(paragraphs)})


def main(poll_interval=300, initial_delay=6):
    """Poll the timeline forever, crawling new articles on every pass.

    The cadence matches what the previous scheduling loop effectively did:
    wait ``initial_delay`` seconds once, then repeatedly sleep
    ``poll_interval`` seconds and crawl. The waits now use ``time.sleep``
    instead of spinning the CPU.

    Args:
        poll_interval: Seconds to sleep before each crawl.
        initial_delay: Seconds to wait once at start-up.
    """
    time.sleep(initial_delay)
    while True:
        time.sleep(poll_interval)
        # Print the time the crawl actually starts, not the pre-sleep time.
        print(datetime.datetime.now())
        try:
            get_page_list()
        except requests.RequestException as exc:
            # A transient network failure must not kill the long-running
            # poller; the affected URL was un-marked and will be retried.
            print('crawl failed, will retry on the next poll:', exc)


if __name__ == '__main__':
    main()