| 
 | 
 
 
发表于 2022-6-13 09:55:41
|
显示全部楼层
 
 
 
- import time
 
 - import pandas as pd
 
 - import requests
 
 - import schedule
 
 - from lxml import etree
 
  
# HTTP request headers passed to requests.get() in get_msg; the value is a
# desktop Edge/Chrome User-Agent string.
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
    ),
}
 
  
 
def _collect_tags(detail_tree):
    """Return the text of every tag link under ``#modeldeals`` on a detail page.

    Links are probed one position at a time (``a[1]``, ``a[2]``, ...) and the
    first match for each position is kept; probing stops at the first position
    that matches nothing.
    """
    tags = []
    if detail_tree is None:
        # No usable tree (presumably an empty/unparseable body -- confirm
        # against lxml); report "no tags" rather than failing.
        return tags
    position = 1
    while True:
        links = detail_tree.xpath(
            '//*[@id="modeldeals"]/div[2]/div/a[{}]'.format(position))
        if not links:
            # An empty result ends the scan; no exception is needed for
            # control flow, so unrelated errors are no longer swallowed.
            break
        tags.append(links[0].text)
        position += 1
    return tags


def get_msg():
    """Scrape the cgmodel.com sales ranking and append it to ``test.csv``.

    Fetches the home page, reads entries 1-10 of the ``ul.sale_ranking``
    list (name, price, detail-page link), fetches each detail page to
    collect its tag links, and appends one row per entry to ``test.csv``
    (row label = name, columns = price and tags, no header row).

    Ranking positions that lack a name, price or link are skipped.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when the server does not respond within the timeout.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    # Seconds to wait for the server; without a timeout requests can block
    # indefinitely and stall the scheduler loop.
    timeout = 10

    print('I am working...')
    url = 'https://www.cgmodel.com/'
    page = requests.get(url, headers=header, timeout=timeout)
    tree = etree.HTML(page.text)
    if tree is None:
        # Nothing could be parsed from the response, so there is no ranking
        # to read on this run.
        print('No HTML could be parsed from', url)
        return

    msg_dic = {}
    for i in range(1, 11):
        item = '//ul[@class="sale_ranking"]/li[{}]'.format(i)
        names = tree.xpath(item + '/p/a')
        prices = tree.xpath(item + '/span')
        hrefs = tree.xpath(item + '/p/a/@href')
        if not (names and prices and hrefs):
            # Shorter ranking list or incomplete entry: skip this position
            # instead of raising IndexError and aborting the whole run.
            continue

        # urljoin copes with relative, root-relative and absolute hrefs;
        # plain concatenation produced '//' or a malformed URL for the
        # latter two.
        msg_url = urljoin(url, hrefs[0])
        # Send the same headers as the index request.
        msg_page = requests.get(msg_url, headers=header, timeout=timeout)
        msg_tree = etree.HTML(msg_page.text)

        msg_dic[names[0].text] = {
            'price': prices[0].text,
            'tags': _collect_tags(msg_tree),
        }

    # One row per ranking entry.
    df = pd.DataFrame(msg_dic).transpose()
    # The author switched from "utf-8" to "ANSI" because the output looked
    # garbled.  NOTE(review): "ansi" is an alias of the Windows-only "mbcs"
    # codec, so this line fails with LookupError on other platforms;
    # "utf-8-sig" is the portable alternative if existing files allow it.
    df.to_csv('test.csv', mode='a+', encoding='ANSI', header=False)
 
  
 
- # print(df)
 
 - schedule.every(10).seconds.do(get_msg)#改了等待时间测试用
 
 - while True:
 
 -     schedule.run_pending()
 
 -     time.sleep(1)
 
  
  复制代码 |   
 
 
 
 |