|
|

楼主 |
发表于 2016-11-28 09:53:53
|
显示全部楼层
自己解决
去重失败原因:每次往list中添加数据时,都会新建一个Ticket对象;而Ticket类没有定义__eq__和__hash__,默认按对象身份(id)进行比较,所以内容相同的对象也不会被set去重
因此增加一个list_ticket_id对象,用来以str形式存储车次号,如果已存在,说明有重复数据
不过我担心以后如果改成查询多个站点到多个站点,就会出现缺失数据的现象,难道只能改成 车次+发车时间+到站时间 吗?有没有其他方法?
另外,因为我在list和set中添加类对象时去重失败,有两个问题需要问一下 :
1.set去重是根据什么去的?id吗?
2.if a in list: in也是根据id判断的吗?
完整修复代码如下
- import requests
- import re
- import itertools
- import json
- import smtplib
- from email.mime.text import MIMEText
- import logging
- import time
- import random
# Log to a per-day file in the current directory; filemode='w' overwrites an
# existing log file of the same name on every run.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename='./' + time.strftime('%Y-%m-%d', time.localtime()) + 'test.log',
                    filemode='w')
# Site root, the left-ticket landing page, and the query URL template.
# The template takes (train_date, from_station_code, to_station_code).
url_base = "https://kyfw.12306.cn"
url_init = "https://kyfw.12306.cn/otn/leftTicket/init"
url_query_base = "https://kyfw.12306.cn/otn/leftTicket/queryX?" \
                 "leftTicketDTO.train_date=%s" \
                 "&leftTicketDTO.from_station=%s" \
                 "&leftTicketDTO.to_station=%s" \
                 "&purpose_codes=ADULT"
# HTML skeleton of the report e-mail; the lone '@' is the placeholder that
# query() replaces with the generated <tr> rows.
table_html = '''<table align="center">
<tbody align="center">
<tr>
<th>车次</th>
<th>是否始发站发车</th>
<th>起始站</th>
<th>发车时间</th>
<th>抵达</th>
<th>到达时间</th>
<th>历时</th>
<th>当前最晚售票日期</th>
<th>售票按钮文字</th>
</tr>
@
</tbody>
</table>
'''
# Request headers sent with every request issued by open_url().
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'kyfw.12306.cn',
    'If-Modified-Since': '0',
    'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'}
# File that the downloaded station name/code data is written to
file_station_name = "station_name.txt"
# Search parameters: departure city, destination city and travel date.
from_station = "杭州"
to_station = "三门峡"
train_date = '2017-01-22'
# SMTP account used to send the report, and the list of recipients.
user_addr = '*****@163.com'
user_password = '*****'
user_host = 'smtp.163.com'
to_list = ['*****@qq.com']
# Shared HTTP session used by open_url().
session = requests.Session()
# Ticket objects collected by query_ticket(), and the train codes already
# collected (used there to skip duplicates).
list_ticket = []
list_ticket_id = []
class Ticket(object):
    """One row of a left-ticket query result.

    Equality and hashing are content-based: two tickets are the same when
    they share train code, boarding station, departure time, alighting
    station and arrival time.  This lets ``set()`` and ``in`` de-duplicate
    separately constructed objects, which identity-based comparison (the
    default for a class without ``__eq__``/``__hash__``) never does.

    NOTE: the hash depends on mutable attributes, so do not modify the
    identifying fields of a ticket while it is stored in a set or dict.
    """

    __slots__ = (
        'station_train_code', 'is_start_station', 'from_station_name', 'start_time', 'to_station_name', 'arrive_time',
        'lishi', 'control_train_day', 'buttonTextInfo')  # tuple of the only attribute names that may be bound

    def __init__(self, station_train_code, is_start_station, from_station_name, start_time,
                 to_station_name, arrive_time, lishi, control_train_day, buttonTextInfo):
        self.station_train_code = station_train_code
        self.is_start_station = is_start_station
        self.from_station_name = from_station_name
        self.start_time = start_time
        self.to_station_name = to_station_name
        self.arrive_time = arrive_time
        self.lishi = lishi
        self.control_train_day = control_train_day
        self.buttonTextInfo = buttonTextInfo

    def _identity(self):
        """Return the tuple of fields that identifies a train segment."""
        return (self.station_train_code, self.from_station_name, self.start_time,
                self.to_station_name, self.arrive_time)

    def __eq__(self, other):
        if not isinstance(other, Ticket):
            return NotImplemented
        return self._identity() == other._identity()

    def __ne__(self, other):
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        return hash(self._identity())

    def __repr__(self):
        return "Ticket(%s)" % ", ".join(repr(part) for part in self._identity())

    def to_str(self):
        """Render the ticket as one HTML ``<tr>`` row, one ``<td>`` per field.

        The cell order matches the column headers of the report table.
        Values are converted with ``str()`` so a non-string field does not
        raise ``TypeError``.
        """
        cells = (
            self.station_train_code,
            self.is_start_station,
            self.from_station_name,
            self.start_time,
            self.to_station_name,
            self.arrive_time,
            self.lishi,
            self.control_train_day,
            self.buttonTextInfo,
        )
        return "<tr><td>" + "</td><td>".join(str(cell) for cell in cells) + "</td></tr>"
def query(url):
    """Run the whole check: refresh station codes, query tickets, mail the result.

    Steps:
      1. Fetch ``url`` and locate the station-name script referenced by it.
      2. Download that script, save its payload to ``file_station_name`` and
         translate ``from_station`` / ``to_station`` into station codes.
      3. Query every (from_code, to_code) combination, giving each at most
         three attempts.
      4. E-mail the collected tickets as an HTML table, or an error mail if a
         station name could not be translated.

    :param url: address of the left-ticket landing page.
    :raises IndexError: if the page does not contain the expected script tags.
    """
    max_attempts = 3

    resp, resp_html = open_url(url)
    pattern = re.compile(r'<script.{,200}<\/script>', re.I | re.M)
    lists = pattern.findall(resp_html)
    # The station name/code script is taken from the 4th-from-last <script>
    # tag; its address is the second double-quoted value in that tag.
    # NOTE(review): depends on the page layout - verify if this breaks.
    url_station_name = url_base + lists[-4].split('"')[3]
    resp, content_station_name = open_url(url_station_name)
    # Drop the first 20 and last 2 characters - presumably the JavaScript
    # assignment wrapper around the data; TODO confirm against the live file.
    content_station_name = content_station_name[20:-2]
    # Compare by value: 'is' on an int only works by accident of interning.
    if resp.status_code == 200:
        # Mode 'w' already truncates the file, no explicit truncate() needed.
        with open(file_station_name, 'w', encoding='UTF-8') as f:
            f.write(content_station_name)
        logging.info('修改%s文件' % file_station_name)
    else:
        logging.error('获取%s文件失败,status_code = %s' % (file_station_name, resp.status_code))

    list_from_station = query_station(staion_name=from_station, file_content=content_station_name)
    list_to_station = query_station(staion_name=to_station, file_content=content_station_name)
    if not (list_from_station and list_to_station):
        send_simple_txt_email(to_list, title="站点编码转换错误",
                              content="出发地:%s \t 目的地:%s" % (from_station, to_station))
        return

    # Cartesian product of all departure codes and all destination codes.
    for x in itertools.product(list_from_station, list_to_station):
        url_query = url_query_base % (train_date, x[0], x[1])
        # Requests sometimes fail or there is no direct train; skip the pair
        # after max_attempts consecutive failures.
        attempts = 0
        while attempts < max_attempts:
            returned = query_ticket(url_query, x, attempts)
            # query_ticket returns 3 on success.  Always advance by at least
            # one so the loop terminates even if the callee hands the counter
            # back unchanged.
            attempts = max(returned, attempts + 1)

    # dict.fromkeys removes equal tickets like set() does, but keeps the
    # order in which they were collected so the report rows are stable.
    content_email = ''.join(ticket.to_str() for ticket in dict.fromkeys(list_ticket))
    send_simple_txt_email(to_list, title="每日12306监控邮件", content=table_html.replace('@', content_email))
# Fetch one ticket-query URL; x is the station-code pair substituted into it,
# count is the caller's attempt counter (the caller gives up at 3)
def query_ticket(url_query, x, count):
    """Query one station pair and append new tickets to ``list_ticket``.

    :param url_query: fully formatted left-ticket query URL.
    :param x: the (from_code, to_code) pair the URL was built from (unused
        here, kept for the caller's signature).
    :param count: number of attempts already made for this URL.
    :return: ``3`` when ticket data was received and processed, which ends
        the caller's retry loop; otherwise ``count + 1``.
    """
    query_resp, query_resp_html = open_url(url_query)
    # '++count' is just two unary plus signs in Python and changes nothing;
    # the counter has to be incremented explicitly.
    count += 1
    try:
        query_json = json.loads(query_resp_html)
    except ValueError:
        # Not JSON (e.g. an HTML error page): count it as a failed attempt.
        logging.error("JSON返回结果错误,URL = %s \n \t JSON = %s" % (url_query, query_resp_html))
        return count

    if not isinstance(query_json, dict) or not query_json.get('status'):
        logging.error("JSON返回结果错误,URL = %s \n \t JSON = %s" % (url_query, query_json))
        return count

    # JSON 'status' is true - this is the normal case, so log it as info.
    logging.info("查询车票JSON状态正确:%s" % query_json)
    if query_json.get('messages'):  # the service reported a problem
        logging.error("查询车票出现messages信息:%s" % query_json['messages'])
    elif not query_json.get('data'):
        logging.info("查询车票出现data信息为空,可能没有直达车次")
    else:
        logging.info("查询车票信息结构正确")
        for data_l in query_json['data']:
            # each data_l is a dict
            data_ll = data_l['queryLeftNewDTO']
            train_code = data_ll['station_train_code']
            # Skip trains that were already collected by an earlier query.
            if train_code in list_ticket_id:
                continue
            list_ticket_id.append(train_code)
            is_start_station = "是" if data_ll['from_station_name'] == data_ll['start_station_name'] else "否"
            ticket = Ticket(train_code, is_start_station,
                            data_ll['from_station_name'], data_ll['start_time'],
                            data_ll['to_station_name'], data_ll['arrive_time'],
                            data_ll['lishi'], data_ll['control_train_day'],
                            str(data_l['buttonTextInfo']).replace('<br/>', ''))
            list_ticket.append(ticket)
        # Success: return the limit so the caller leaves its retry loop.
        count = 3
    return count
# HTML e-mail
def send_simple_txt_email(to_user_list=to_list, title='这是标题呀', content='这是内容呀'):
    """Send ``content`` as an HTML e-mail from the configured account.

    :param to_user_list: list of recipient addresses.
    :param title: subject line.
    :param content: HTML body.
    :return: ``True`` if the message was handed to the server, ``False`` if
        connecting, logging in or sending failed (the error is logged).
    """
    msg = MIMEText(_text=content, _subtype='html', _charset='gb2312')
    msg['Subject'] = title
    msg['From'] = user_addr
    # Address lists in mail headers are comma-separated; ';' is non-standard.
    msg['To'] = ", ".join(to_user_list)
    logging.info('开始发送邮件')
    try:
        # The context manager closes the connection even when login or
        # sendmail raises; the previous code leaked it in that case.
        with smtplib.SMTP(host=user_host) as server:
            server.login(user_addr, user_password)
            server.sendmail(from_addr=user_addr, to_addrs=to_user_list, msg=msg.as_string())
    except (smtplib.SMTPException, OSError) as e:
        # logging.exception records the message together with the traceback.
        logging.exception('发送邮件出现异常,请检查')
        print(e)
        return False
    logging.info("发送邮件成功")
    return True
# Look up the station codes matching a station name; returns a list
def query_station(staion_name, file_content):
    """Return the codes of all stations whose record contains ``staion_name``.

    ``file_content`` is a sequence of '@'-separated records, each made of
    '|'-separated fields; the third field of a record is returned as the
    code.  Matching is by substring on the whole record, so one name can
    yield several codes.

    :param staion_name: text to search for (parameter name kept as-is
        because callers pass it by keyword).
    :param file_content: raw station data.
    :return: list of matching codes, empty when nothing matches.
    """
    list_result = []
    for record in file_content.split('@'):
        if staion_name not in record:
            continue
        fields = record.split('|')
        # Ignore empty or truncated records instead of raising IndexError.
        if len(fields) > 2:
            list_result.append(fields[2])
    if list_result:
        logging.info("站点:%s 转换成功" % staion_name)
    else:
        logging.info("站点:%s 转换结果为空" % staion_name)
    return list_result
# Returns the response object and its body decoded as UTF-8 text
def open_url(url):
    """GET ``url`` through the shared session after a short random pause.

    :param url: address to fetch.
    :return: tuple ``(resp, resp_html)`` - the response object and its body
        decoded as UTF-8.
    :raises UnicodeDecodeError: if the body is not valid UTF-8.
    """
    # Sleep a random 2-5 seconds between requests.
    time.sleep(random.randint(2, 5))
    # SECURITY: verify=False disables TLS certificate verification, so the
    # connection can be intercepted.  Kept because the original code relies
    # on it - re-enable verification if the server's certificate allows it.
    resp = session.get(url=url, headers=headers, verify=False)
    # Decode the raw bytes directly.  Going through resp.text and re-encoding
    # with resp.encoding fails when the encoding is None and can lose bytes
    # when the guessed charset is not the real one.
    resp_html = resp.content.decode('utf-8')
    logging.info("访问url:%s" % url)
    return resp, resp_html
def main():
    """Script entry point: run the ticket check starting from ``url_init``."""
    query(url_init)


if __name__ == "__main__":
    main()
复制代码 |
|