鱼C论坛 (FishC Forum)

Views: 655 | Replies: 2

[Solved] Python threading

Posted 2023-5-30 18:43:21

Does writing it this way count as multithreading? Also, why can't I get the elapsed time to print?
import threading
import multiprocessing  # imported but unused
import time

import requests
import logging
import re
from urllib.parse import urljoin
from os.path import exists
from os import makedirs

result_dir = 'tu'
exists(result_dir) or makedirs(result_dir)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
pages = 3


def scrpae_page(url):
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            response.encoding = 'gbk'
            return response
        logging.error('get invalid status code %s while scrape %s', response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)

# Build the URL for one index page and fetch it
def scrape_index(page):
    index_url = url.format(page)
    return scrpae_page(index_url)

# Parse titles and detail-page URLs out of the index HTML
def parse_index(html):
    # logging.info('{}'.format(html))
    url_pattern = re.compile('<script.*?src = "(.*?)"; </script>', re.S)
    items = re.findall(url_pattern, html)
    title_pattern = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>', re.S)
    titles = re.findall(title_pattern, html)
    # logging.info('{}'.format(list(titles)))

    return {
        '名称': titles,
        'detail_url': items
    }

# Join the detail path onto the base URL and fetch the image bytes
def parse_url(path):
    detail_url = urljoin(url, path)
    return scrpae_page(detail_url).content

# Save the image bytes to disk under a sanitized file name
def save(title, conect):
    invalid_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']  # invalid Windows filename characters
    for char in invalid_chars:
        title = title.replace(char, '')
    img_path = result_dir + '/' + title + '.jpg'
    with open(img_path, 'wb') as fp:
        fp.write(conect)


def main(page):
    index_html = scrape_index(page)
    titles = parse_index(index_html.text)['名称']
    details_url = parse_index(index_html.text)['detail_url']
    for title, url in zip(titles, details_url):
        conect = parse_url(url)
        save(title, conect)
        logging.info('saved successfully')


if __name__ == '__main__':
    start = time.time()
    for page in range(1, pages):
        t = threading.Thread(target=main, args=(page,))
        t.start()
    end = time.time()
    print('time : {}'.format(end - start))
Posted 2023-5-30 18:48:18 | Accepted as best answer
Last edited by isdkz on 2023-5-30 18:50

Yes, written this way it is multithreaded. And the time is not failing to print; it was printed right away, because the main thread keeps running while the child threads are still working.

You can use join to make the main thread wait until the child threads have finished.
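
To see the effect in isolation, here is a minimal standalone sketch (my illustration, not code from your post; the worker function is hypothetical). Without join the main thread reaches the print almost immediately; after joining, the elapsed time reflects the full run:

import threading
import time

def worker():
    time.sleep(1)  # stand-in for a slow network request

start = time.time()
threads = [threading.Thread(target=worker) for _ in range(3)]
for t in threads:
    t.start()
print('before join: {:.3f}s'.format(time.time() - start))  # ~0.0, printed long before the workers finish

for t in threads:
    t.join()  # block until every worker is done
print('after join:  {:.3f}s'.format(time.time() - start))  # ~1.0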

Here is your code with that change applied (the additions are the tasks list, the tasks.append(t) call, and the join loop at the end):

import threading
import multiprocessing  # imported but unused
import time

import requests
import logging
import re
from urllib.parse import urljoin
from os.path import exists
from os import makedirs

result_dir = 'tu'
exists(result_dir) or makedirs(result_dir)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
pages = 3


def scrpae_page(url):
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            response.encoding = 'gbk'
            return response
        logging.error('get invalid status code %s while scrape %s', response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)

# Build the URL for one index page and fetch it
def scrape_index(page):
    index_url = url.format(page)
    return scrpae_page(index_url)

# Parse titles and detail-page URLs out of the index HTML
def parse_index(html):
    # logging.info('{}'.format(html))
    url_pattern = re.compile('<script.*?src = "(.*?)"; </script>', re.S)
    items = re.findall(url_pattern, html)
    title_pattern = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>', re.S)
    titles = re.findall(title_pattern, html)
    # logging.info('{}'.format(list(titles)))

    return {
        '名称': titles,
        'detail_url': items
    }

# Join the detail path onto the base URL and fetch the image bytes
def parse_url(path):
    detail_url = urljoin(url, path)
    return scrpae_page(detail_url).content

# Save the image bytes to disk under a sanitized file name
def save(title, conect):
    invalid_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']  # invalid Windows filename characters
    for char in invalid_chars:
        title = title.replace(char, '')
    img_path = result_dir + '/' + title + '.jpg'
    with open(img_path, 'wb') as fp:
        fp.write(conect)


def main(page):
    index_html = scrape_index(page)
    titles = parse_index(index_html.text)['名称']
    details_url = parse_index(index_html.text)['detail_url']
    for title, url in zip(titles, details_url):
        conect = parse_url(url)
        save(title, conect)
        logging.info('saved successfully')


if __name__ == '__main__':
    start = time.time()
    tasks = []                                          # added: keep a reference to every thread
    for page in range(1, pages):
        t = threading.Thread(target=main, args=(page,))
        tasks.append(t)                                 # added
        t.start()

    for t in tasks:                                     # added: wait for all threads to finish
        t.join()
    end = time.time()
    print('time : {}'.format(end - start))
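For comparison, the standard library's concurrent.futures offers the same start-then-wait pattern in fewer lines. This sketch is an editorial addition, not part of the original answer; it reuses the main, pages, and time names defined in the code above:

from concurrent.futures import ThreadPoolExecutor

start = time.time()
with ThreadPoolExecutor() as executor:
    # list() forces the result iterator so worker exceptions surface here;
    # leaving the with block waits for every submitted task to finish
    list(executor.map(main, range(1, pages)))
print('time : {}'.format(time.time() - start))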



Original poster | Posted 2023-5-30 18:58:56
isdkz replied 2023-5-30 18:48:
Yes, written this way it is multithreaded. And the time is not failing to print; it was printed right away, because the main thread keeps running while the child threads are still working.

You can ...

Got it, thank you very much!
