def open_url(url):
    """Fetch *url* over HTTP and return the raw response body as bytes.

    BUG FIX: the original built a ``Request`` object but then passed the
    bare URL string to ``urlopen``, leaving ``req`` unused; the response
    object was also never closed.  The ``Request`` is now actually used
    and the response is closed via a context manager.
    """
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_maxpage(url):
    """Scrape the index page and return the last page number as an int.

    Relies on the page containing a ``<a href=...NNN.htm">末页`` ("last
    page") link; the number is sliced out of the href by fixed offsets.
    """
    page = open_url(url).decode('utf-8')
    # Position of the marker just after the page number in the href.
    end_marker = page.find('.htm">末页')
    # Search backwards (within a 40-char window) for the href opening.
    start_marker = page.find('a href=', end_marker - 40, end_marker)
    # +14 skips past 'a href=' and the path prefix up to the digits.
    return int(page[start_marker + 14:end_marker])
def get_url(url):
    """Return the list of relative article paths found on the index page.

    Scans the decoded HTML for occurrences of ``a href= /News`` and slices
    the path out of each anchor using fixed offsets (fragile, but matches
    this site's markup).
    """
    text = open_url(url).decode('utf-8')
    links = []
    pos = text.find('a href= /News')
    while pos != -1:
        # The anchor's 'target' attribute bounds the href value.
        stop = text.find("target", pos, pos + 50)
        if stop == -1:
            # Malformed anchor: skip past the 'a href= ' prefix and move on.
            stop = pos + 8
        else:
            # +8 skips 'a href= '; -5 trims ' " ' noise before 'target'.
            links.append(text[pos + 8:stop - 5])
        pos = text.find('a href= /News', stop)
    return links
def download_url():
    """Crawl the index page and insert each full article URL into test1.

    Builds absolute URLs from the relative paths returned by ``get_url``
    and stores them in the ``test1.nr_url`` column of the MSSQL database.
    """
    url = "http://www.tj2zy.com/Class/sxbzxrmd/index.htm"
    front = "http://www.tj2zy.com"
    back = ".htm"
    page_paths = get_url(url)
    # NOTE(review): the original also computed get_maxpage(url) but never
    # used it — only the first index page is crawled.  Dead call removed;
    # TODO: use the page count to crawl the remaining index pages.

    # SECURITY: credentials are hardcoded in source — move to config/env.
    conn = pymssql.connect(host='192.168.0.123', user='sa',
                           password='glad2015', database='test',
                           charset="utf8")
    try:
        cursor = conn.cursor()
        for path in page_paths:
            page_url = front + path + back
            # BUG FIX: the original used executemany() with a bare string
            # as the parameter sequence, which iterates it character by
            # character.  Use execute() with a one-element tuple.
            cursor.execute('insert into test1(nr_url) values(%s)',
                           (page_url,))
            print(page_url)
        # BUG FIX: commit() was called before any insert and never after,
        # so the inserts were never persisted.  Commit after the loop.
        conn.commit()
        cursor.close()
    finally:
        # Always release the connection, even if an insert fails.
        conn.close()
# Script entry point: run the crawler only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    download_url()