|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
请问这样写算多线程吗,还有想问一下这个时间怎么打印不出来
- import threading
- import multiprocessing
- import time
- import requests
- import logging
- import re
- from urllib.parse import urljoin
- from os.path import exists
- from os import makedirs
- result_dir = 'tu'
- exists(result_dir) or makedirs(result_dir)
- logging.basicConfig(level=logging.INFO,format = '%(asctime)s - %(levelname)s: %(message)s')
- header = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
- }
- url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
- pages = 3
- def scrpae_page(url):
- logging.info('scraping %s...', url)
- try:
- response = requests.get(url=url,headers=header)
- if response.status_code == 200:
- response.encoding = 'gbk'
- return response
- logging.error('get invalid status code %s while scrape %s',response.status_code,url)
- except requests.RequestException:
- logging.error('error occurred while scraping %s',url,exc_info=True)
- #拼接URl,并爬取主页
- def scrape_index(page):
- index_url = url.format(page)
- return scrpae_page(index_url)
- #解析详情页url
- def parse_index(html):
- #logging.info('{}'.format(html))
- url_pattern = re.compile('<script.*?src = "(.*?)"; </script>',re.S)
- items = re.findall(url_pattern,html)
- title_pattern = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>',re.S)
- titles = re.findall(title_pattern,html)
- # logging.info('{}'.format(list(titles)))
- return {
- '名称':titles,
- 'detail_url':items
- }
- #拼接url,并调用scrpae_page
- def parse_url(path):
- detail_url = urljoin(url,path)
- return scrpae_page(detail_url).content
- #保存
- def save(title,conect):
- invalid_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|'] # Invalid Windows characters
- for char in invalid_chars:
- title = title.replace(char, '')
- img_path = result_dir + '/' + title + '.jpg'
- with open(img_path,'wb') as fp:
- fp.write(conect)
- def main(page):
-
- index_html = scrape_index(page)
- titles = parse_index(index_html.text)['名称']
- details_url = parse_index(index_html.text)['detail_url']
- for title,url in zip(titles,details_url):
- conect = parse_url(url)
- save(title,conect)
- logging.info('保存成功')
-
- if __name__ == '__main__':
- start = time.time()
- for page in range(1,pages):
- t = threading.Thread(target=main,args=(page,))
- t.start()
- end = time.time()
- print('time : {}'.format(end-start))
复制代码
本帖最后由 isdkz 于 2023-5-30 18:50 编辑
这样写算多线程,时间不是没有打印,而是早就打印了,因为子线程在运行的同时主线程也会继续运行的
你可以使用 join 让主线程等待子线程运行结束
对你的代码修改如下(新增的内容在 86、92、93行):
- import threading
- import multiprocessing
- import time
- import requests
- import logging
- import re
- from urllib.parse import urljoin
- from os.path import exists
- from os import makedirs
- result_dir = 'tu'
- exists(result_dir) or makedirs(result_dir)
- logging.basicConfig(level=logging.INFO,format = '%(asctime)s - %(levelname)s: %(message)s')
- header = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
- }
- url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
- pages = 3
- def scrpae_page(url):
- logging.info('scraping %s...', url)
- try:
- response = requests.get(url=url,headers=header)
- if response.status_code == 200:
- response.encoding = 'gbk'
- return response
- logging.error('get invalid status code %s while scrape %s',response.status_code,url)
- except requests.RequestException:
- logging.error('error occurred while scraping %s',url,exc_info=True)
- #拼接URl,并爬取主页
- def scrape_index(page):
- index_url = url.format(page)
- return scrpae_page(index_url)
- #解析详情页url
- def parse_index(html):
- #logging.info('{}'.format(html))
- url_pattern = re.compile('<script.*?src = "(.*?)"; </script>',re.S)
- items = re.findall(url_pattern,html)
- title_pattern = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>',re.S)
- titles = re.findall(title_pattern,html)
- # logging.info('{}'.format(list(titles)))
- return {
- '名称':titles,
- 'detail_url':items
- }
- #拼接url,并调用scrpae_page
- def parse_url(path):
- detail_url = urljoin(url,path)
- return scrpae_page(detail_url).content
- #保存
- def save(title,conect):
- invalid_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|'] # Invalid Windows characters
- for char in invalid_chars:
- title = title.replace(char, '')
- img_path = result_dir + '/' + title + '.jpg'
- with open(img_path,'wb') as fp:
- fp.write(conect)
- def main(page):
-
- index_html = scrape_index(page)
- titles = parse_index(index_html.text)['名称']
- details_url = parse_index(index_html.text)['detail_url']
- for title,url in zip(titles,details_url):
- conect = parse_url(url)
- save(title,conect)
- logging.info('保存成功')
-
- if __name__ == '__main__':
- start = time.time()
- tasks = []
- for page in range(1,pages):
- t = threading.Thread(target=main,args=(page,))
- tasks.append(t)
- t.start()
- for t in tasks:
- t.join()
- end = time.time()
- print('time : {}'.format(end-start))
复制代码
|
|