set去重失败

ycgzs98789 · 发表于 2016-11-27 20:09:54

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

本帖最后由 ycgzs98789 于 2016-11-27 20:14 编辑

写这个程序的目的是为了监控12306什么时候卖票，好抢票回家

但是发现查询出来的结果有好多重复的车次信息，使用list转set再转回来还是有重复信息，求各位帮助

list定义在97行，转set在136行，list增加数据在170行

import requests
import re
import itertools
import json
import smtplib
from email.mime.text import MIMEText
import logging
import time
import random
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
filename='./'+time.strftime('%Y-%m-%d', time.localtime())+'test.log',
filemode='w')
url_base = "https://kyfw.12306.cn"
url_init = "https://kyfw.12306.cn/otn/leftTicket/init"
url_query_base = "https://kyfw.12306.cn/otn/leftTicket/queryX?" \
"leftTicketDTO.train_date=%s" \
"&leftTicketDTO.from_station=%s" \
"&leftTicketDTO.to_station=%s" \
"&purpose_codes=ADULT"
table_html = '''<table align="center">
<tbody align="center">
<tr>
<th>车次</th>
<th>是否始发站发车</th>
<th>起始站</th>
<th>发车时间</th>
<th>抵达</th>
<th>到达时间</th>
<th>历时</th>
<th>当前最晚售票日期</th>
<th>售票按钮文字</th>
</tr>
@
</tbody>
</table>
'''
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, sdch, br',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'kyfw.12306.cn',
'If-Modified-Since': '0',
'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
#站点编码文件
file_station_name = "station_name.txt"
from_station = "杭州"
to_station = "三门峡"
train_date = '2017-01-22'
user_addr = '*****@163.com'
user_password = '*****'
user_host = 'smtp.163.com'
to_list = ['*****@qq.com']
session = requests.Session()
class Ticket(object):
__slots__ = ('station_train_code', 'is_start_station', 'from_station_name', 'start_time', 'to_station_name', 'arrive_time', 'lishi', 'control_train_day', 'buttonTextInfo') # 用tuple定义允许绑定的属性名称
def __init__(self, station_train_code, is_start_station, from_station_name, start_time,
to_station_name, arrive_time, lishi, control_train_day, buttonTextInfo):
self.station_train_code = station_train_code
self.is_start_station = is_start_station
self.from_station_name = from_station_name
self.start_time = start_time
self.to_station_name = to_station_name
self.arrive_time = arrive_time
self.lishi = lishi
self.control_train_day = control_train_day
self.buttonTextInfo = buttonTextInfo
def to_str(self):
return "<tr><td>" + self.station_train_code\
+ "</td><td>" + self.is_start_station \
+ "</td><td>" + self.from_station_name \
+ "</td><td>" + self.start_time \
+ "</td><td>" + self.to_station_name \
+ "</td><td>" + self.arrive_time \
+ "</td><td>" + self.lishi\
+ "</td><td>" + self.control_train_day \
+ "</td><td>" + self.buttonTextInfo + "</td></th>"
list_ticket = []
def query(url):
resp, resp_html = open_url(url)
pattern = re.compile(r'<script.{,200}<\/script>', re.I | re.M)
lists = pattern.findall(resp_html)
# station_name是车站名与编号的对应文件
url_station_name = url_base + lists[-4].split('"')[3]
resp, content_station_name = open_url(url_station_name)
content_station_name = content_station_name[20:-2]
if resp.status_code is 200:
with open(file_station_name, 'w', encoding='UTF-8') as f:
f.truncate()
f.write(content_station_name)
logging.info('修改%s文件' % file_station_name)
else:
logging.error('获取%s文件失败,status_code = %s' % (file_station_name, resp.status_code))
list_from_station = query_station(staion_name=from_station, file_content=content_station_name)
list_to_station = query_station(staion_name=to_station, file_content=content_station_name)
if (len(list_from_station) > 0) and (len(list_to_station) > 0):
# 计数，有时候会出现请求错误、或没有相应直达车次，连续失败3次后跳过
global count
count = 0
# 笛卡尔积
for x in itertools.product(list_from_station, list_to_station):
url_query = url_query_base % (train_date, x[0], x[1])
# 不等于0说明失败
while count < 3:
count = query_ticket(url_query, x, count)
count = 0
continue
content_email = ''
for ticket in list(set(list_ticket)): # 就是这里错了，转换后还是有重复数据
content_email += ticket.to_str()
send_simple_txt_email(to_list,title="每日12306监控邮件", content=table_html.replace('@', content_email))
count = 0
else:
send_simple_txt_email(to_list,title="站点编码转换错误", content="出发地：%s \t 目的地：%s" % (from_station, to_station))
pass
pass
# 访问车票查询URL， x是替换参数， count用来3次计数
def query_ticket(url_query, x, count):
query_resp, query_resp_html = open_url(url_query)
query_json = json.loads(query_resp_html)
++count
if query_json['status']: # JSON状态status为True
logging.error("查询车票JSON状态正确:%s" % query_json)
if query_json['messages']: # 出现异常情况
logging.error("查询车票出现messages信息:%s" % query_json['messages'])
elif (not query_json['data']) or (len(query_json['data']) == 0):
logging.info("查询车票出现data信息为空，可能没有直达车次")
else:
count = 0
logging.info("查询车票信息结构正确")
for data_l in query_json['data']:
# type(data_l) = <class 'dict'>
data_ll = data_l['queryLeftNewDTO']
#加到list中
list_ticket.append(Ticket(data_ll['station_train_code'], (data_ll['from_station_name'] == data_ll['start_station_name'] and "是" or "否"),
data_ll['from_station_name'], data_ll['start_time'], data_ll['to_station_name'], data_ll['arrive_time'],
data_ll['lishi'], data_ll['control_train_day'], str(data_l['buttonTextInfo']).replace('<br/>', '')))
# 查询成功,跳出循环
count = 3
else:
logging.error("JSON返回结果错误,URL = %s \n \t JSON = %s" % (url_query, query_json))
return count
# html邮件
def send_simple_txt_email(to_user_list=to_list, title='这是标题呀', content='这是内容呀'):
pass
msg = MIMEText(_text=content, _subtype='html', _charset='gb2312')
msg['Subject'] = title
msg['From'] = user_addr
msg['To'] = ";".join(to_user_list)
logging.info('开始发送邮件')
try:
server = smtplib.SMTP()
server.connect(host=user_host)
server.login(user_addr, user_password)
server.sendmail(from_addr=user_addr, to_addrs=to_user_list, msg=msg.as_string())
server.close()
logging.info("发送邮件成功")
return True
except Exception as e:
logging.error('发送邮件出现异常，请检查')
logging.error(e)
print(e)
return False
# 查询对应的站点对应编码,list格式
def query_station(staion_name, file_content):
list_content = file_content.split('@')
list_result = []
for content in list_content:
if staion_name in content:
list_result.append(content.split('|')[2])
if list_result is None or len(list_result) == 0:
logging.info("站点：%s 转换结果为空" % staion_name)
else:
logging.info("站点：%s 转换成功" % staion_name)
return list_result
# 返回resp和html编码
def open_url(url):
#随机睡眠2-5秒
time.sleep(random.randint(2, 5))
# 关闭验证
resp = session.get(url=url, headers=headers, verify=False)
resp_html = resp.text.encode(resp.encoding).decode('utf-8')
logging.info("访问url：%s" % url)
return resp, resp_html
def main():
query(url_init)
if __name__ == '__main__':
main()

复制代码

Bladem · 发表于 2016-11-27 23:42:35

大神

ycgzs98789 · 发表于 2016-11-28 08:46:53

Bladem 发表于 2016-11-27 23:42
大神

python学了1个半月，再懒一点，看看有什么东西是程序可以代替的，也就能到我这样了

ycgzs98789 · 发表于 2016-11-28 09:53:53

自己解决
去重失败原因：每次我在往list中添加数据的时候，都是新建的类对象，id不一致
因此增加一个list_ticket_id对象，用来以str形式存储车次号，如果已存在，说明有重复数据

不过我担心以后如果改成查询多个站点到多个站点，就会出现缺失数据的现象，难道只能改成车次+发车时间+到站时间吗？有没有其他方法？

另外，因为我在list和set中添加类对象时去重失败，有两个问题需要问一下

：
1.set去重是根据什么去的？id吗?
2.if a in list: in也是根据id判断的吗？

完整修复代码如下

import requests
import re
import itertools
import json
import smtplib
from email.mime.text import MIMEText
import logging
import time
import random
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
filename='./' + time.strftime('%Y-%m-%d', time.localtime()) + 'test.log',
filemode='w')
url_base = "https://kyfw.12306.cn"
url_init = "https://kyfw.12306.cn/otn/leftTicket/init"
url_query_base = "https://kyfw.12306.cn/otn/leftTicket/queryX?" \
"leftTicketDTO.train_date=%s" \
"&leftTicketDTO.from_station=%s" \
"&leftTicketDTO.to_station=%s" \
"&purpose_codes=ADULT"
table_html = '''<table align="center">
<tbody align="center">
<tr>
<th>车次</th>
<th>是否始发站发车</th>
<th>起始站</th>
<th>发车时间</th>
<th>抵达</th>
<th>到达时间</th>
<th>历时</th>
<th>当前最晚售票日期</th>
<th>售票按钮文字</th>
</tr>
@
</tbody>
</table>
'''
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, sdch, br',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'kyfw.12306.cn',
'If-Modified-Since': '0',
'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
# 站点编码文件
file_station_name = "station_name.txt"
from_station = "杭州"
to_station = "三门峡"
train_date = '2017-01-22'
user_addr = '*****@163.com'
user_password = '*****'
user_host = 'smtp.163.com'
to_list = ['*****@qq.com']
session = requests.Session()
list_ticket = []
list_ticket_id = []
class Ticket(object):
__slots__ = (
'station_train_code', 'is_start_station', 'from_station_name', 'start_time', 'to_station_name', 'arrive_time',
'lishi', 'control_train_day', 'buttonTextInfo') # 用tuple定义允许绑定的属性名称
def __init__(self, station_train_code, is_start_station, from_station_name, start_time,
to_station_name, arrive_time, lishi, control_train_day, buttonTextInfo):
self.station_train_code = station_train_code
self.is_start_station = is_start_station
self.from_station_name = from_station_name
self.start_time = start_time
self.to_station_name = to_station_name
self.arrive_time = arrive_time
self.lishi = lishi
self.control_train_day = control_train_day
self.buttonTextInfo = buttonTextInfo
def to_str(self):
return "<tr><td>" + self.station_train_code \
+ "</td><td>" + self.is_start_station \
+ "</td><td>" + self.from_station_name \
+ "</td><td>" + self.start_time \
+ "</td><td>" + self.to_station_name \
+ "</td><td>" + self.arrive_time \
+ "</td><td>" + self.lishi \
+ "</td><td>" + self.control_train_day \
+ "</td><td>" + self.buttonTextInfo + "</td></tr>"
def query(url):
resp, resp_html = open_url(url)
pattern = re.compile(r'<script.{,200}<\/script>', re.I | re.M)
lists = pattern.findall(resp_html)
# station_name是车站名与编号的对应文件
url_station_name = url_base + lists[-4].split('"')[3]
resp, content_station_name = open_url(url_station_name)
content_station_name = content_station_name[20:-2]
if resp.status_code is 200:
with open(file_station_name, 'w', encoding='UTF-8') as f:
f.truncate()
f.write(content_station_name)
logging.info('修改%s文件' % file_station_name)
else:
logging.error('获取%s文件失败,status_code = %s' % (file_station_name, resp.status_code))
list_from_station = query_station(staion_name=from_station, file_content=content_station_name)
list_to_station = query_station(staion_name=to_station, file_content=content_station_name)
if (len(list_from_station) > 0) and (len(list_to_station) > 0):
# 计数，有时候会出现请求错误、或没有相应直达车次，连续失败3次后跳过
global count
count = 0
# 笛卡尔积
for x in itertools.product(list_from_station, list_to_station):
url_query = url_query_base % (train_date, x[0], x[1])
# 不等于0说明失败
while count < 3:
count = query_ticket(url_query, x, count)
count = 0
continue
content_email = ''
for ticket in list(set(list_ticket)): # 就是这里错了，转换后还是有重复数据
content_email += ticket.to_str()
send_simple_txt_email(to_list, title="每日12306监控邮件", content=table_html.replace('@', content_email))
count = 0
else:
send_simple_txt_email(to_list, title="站点编码转换错误", content="出发地：%s \t 目的地：%s" % (from_station, to_station))
pass
pass
# 访问车票查询URL， x是替换参数， count用来3次计数
def query_ticket(url_query, x, count):
query_resp, query_resp_html = open_url(url_query)
query_json = json.loads(query_resp_html)
++count
if query_json['status']: # JSON状态status为True
logging.error("查询车票JSON状态正确:%s" % query_json)
if query_json['messages']: # 出现异常情况
logging.error("查询车票出现messages信息:%s" % query_json['messages'])
elif (not query_json['data']) or (len(query_json['data']) == 0):
logging.info("查询车票出现data信息为空，可能没有直达车次")
else:
count = 0
logging.info("查询车票信息结构正确")
for data_l in query_json['data']:
# type(data_l) = <class 'dict'>
data_ll = data_l['queryLeftNewDTO']
# 加到list中
if data_ll['station_train_code'] in list_ticket_id:
continue
else:
list_ticket_id.append(data_ll['station_train_code'])
ticket = Ticket(data_ll['station_train_code'], (
data_ll['from_station_name'] == data_ll['start_station_name'] and "是" or "否"),
data_ll['from_station_name'], data_ll['start_time'],
data_ll['to_station_name'], data_ll['arrive_time'],
data_ll['lishi'], data_ll['control_train_day'],
str(data_l['buttonTextInfo']).replace('<br/>', ''))
list_ticket.append(ticket)
# 查询成功,跳出循环
count = 3
else:
logging.error("JSON返回结果错误,URL = %s \n \t JSON = %s" % (url_query, query_json))
return count
# html邮件
def send_simple_txt_email(to_user_list=to_list, title='这是标题呀', content='这是内容呀'):
pass
msg = MIMEText(_text=content, _subtype='html', _charset='gb2312')
msg['Subject'] = title
msg['From'] = user_addr
msg['To'] = ";".join(to_user_list)
logging.info('开始发送邮件')
try:
server = smtplib.SMTP()
server.connect(host=user_host)
server.login(user_addr, user_password)
server.sendmail(from_addr=user_addr, to_addrs=to_user_list, msg=msg.as_string())
server.close()
logging.info("发送邮件成功")
return True
except Exception as e:
logging.error('发送邮件出现异常，请检查')
logging.error(e)
print(e)
return False
# 查询对应的站点对应编码,list格式
def query_station(staion_name, file_content):
list_content = file_content.split('@')
list_result = []
for content in list_content:
if staion_name in content:
list_result.append(content.split('|')[2])
if list_result is None or len(list_result) == 0:
logging.info("站点：%s 转换结果为空" % staion_name)
else:
logging.info("站点：%s 转换成功" % staion_name)
return list_result
# 返回resp和html编码
def open_url(url):
# 随机睡眠2-5秒
time.sleep(random.randint(2, 5))
# 关闭验证
resp = session.get(url=url, headers=headers, verify=False)
resp_html = resp.text.encode(resp.encoding).decode('utf-8')
logging.info("访问url：%s" % url)
return resp, resp_html
def main():
query(url_init)
if __name__ == '__main__':
main()

复制代码

leo8080 · 发表于 2016-11-28 10:29:56

好贴 mark一下

ycgzs98789 · 发表于 2016-11-28 14:07:13

我去重开一贴，现在感觉问题和标题关系不大了

账号		自动登录	找回密码
密码			立即注册