python线程,Python交流,编程语言专区,鱼C论坛

哈岁NB 发表于 2023-5-30 18:43:21

python线程

请问这样写算多线程吗？另外想问一下，为什么运行时间打印不出来？
import threading
import multiprocessing
import time

import requests
import logging
import re
from urllib.parse import urljoin
from os.path import exists
from os import makedirs

# Directory the downloaded images are written to.
result_dir = 'tu'
# exist_ok=True creates the directory only when it is missing, without the
# gap between a separate exists() check and the makedirs() call.
makedirs(result_dir, exist_ok=True)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Request headers sent with every fetch.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
}
# Index page address template; '{}' is filled with the page number.
url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
# Exclusive upper bound of the page numbers iterated by the __main__ block.
pages = 3

def scrpae_page(url):
    """Fetch *url* and return the response, or ``None`` on failure.

    On a 200 status the response encoding is set to ``'gbk'`` before the
    response is returned. Any other status code, and any
    ``requests.RequestException`` (timeouts included), is logged and
    results in ``None``.
    """
    logging.info('scraping %s...', url)
    try:
        # Without a timeout a stalled server would block the calling thread
        # forever.
        response = requests.get(url=url, headers=header, timeout=10)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None
    if response.status_code == 200:
        response.encoding = 'gbk'
        return response
    logging.error('get invalid status code %s while scrape %s', response.status_code, url)
    return None

def scrape_index(page):
    """Build the address of index page *page* and fetch it.

    *page* is filled into the module-level ``url`` template; the return
    value is whatever ``scrpae_page`` returns for that address.
    """
    index_url = url.format(page)
    return scrpae_page(index_url)

# Compiled once at import time. re.S lets '.' match line breaks inside the
# page markup.
_DETAIL_URL_PATTERN = re.compile(r'<script.*?src = "(.*?)"; </script>', re.S)
_TITLE_PATTERN = re.compile(
    r'<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>', re.S
)


def parse_index(html):
    """Extract image titles and detail paths from index page text.

    Args:
        html: Text of an index page.

    Returns:
        A dict with two lists: ``'名称'`` holds the captured ``<span>``
        titles and ``'detail_url'`` holds the captured ``src = "..."``
        values, each in document order.
    """
    return {
        '名称': _TITLE_PATTERN.findall(html),
        'detail_url': _DETAIL_URL_PATTERN.findall(html),
    }

def parse_url(path):
    """Download the resource at *path* and return its raw bytes.

    *path* is resolved against the module-level ``url`` with ``urljoin``
    and fetched through ``scrpae_page``. Returns ``None`` when that fetch
    failed, instead of raising ``AttributeError`` on the missing response.
    """
    detail_url = urljoin(url, path)
    response = scrpae_page(detail_url)
    if response is None:
        return None
    return response.content


# Characters that are invalid in Windows file names, mapped to deletion.
_INVALID_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')


def save(title, conect):
    """Write the bytes *conect* to ``<result_dir>/<title>.jpg``.

    Characters that are invalid in Windows file names are stripped from
    *title* before it is used as the file name.
    """
    safe_title = title.translate(_INVALID_FILENAME_CHARS)
    img_path = result_dir + '/' + safe_title + '.jpg'
    with open(img_path, 'wb') as fp:
        fp.write(conect)


def main(page):
    """Download every image listed on index page *page*.

    A page or image whose fetch failed is skipped; ``scrpae_page`` has
    already logged the failure by then.
    """
    index_html = scrape_index(page)
    if index_html is None:
        return

    # Parse once and reuse the result for both titles and paths.
    parsed = parse_index(index_html.text)
    for title, path in zip(parsed['名称'], parsed['detail_url']):
        conect = parse_url(path)
        if conect is None:
            continue
        save(title, conect)
        logging.info('保存成功')

if __name__ == '__main__':
    began = time.time()
    # One worker thread per index page in range(1, pages). The threads are
    # started but never joined, so the main thread reaches the timing code
    # below as soon as the last thread has been started; the printed figure
    # does not include the time the workers spend downloading.
    for page_no in range(1, pages):
        threading.Thread(target=main, args=(page_no,)).start()
    finished = time.time()
    print('time : {}'.format(finished - began))

isdkz 发表于 2023-5-30 18:48:18

本帖最后由 isdkz 于 2023-5-30 18:50 编辑

这样写算多线程，时间不是没有打印，而是早就打印了，因为子线程在运行的同时主线程也会继续运行的

你可以使用 join 让主线程等待子线程运行结束

对你的代码修改如下（新增的内容是用来保存线程对象的 tasks 列表，以及末尾等待所有线程结束的 join 循环；原帖标注的行号为 86、92、93行）：

import threading
import multiprocessing
import time

import requests
import logging
import re
from urllib.parse import urljoin
from os.path import exists
from os import makedirs

# Directory the downloaded images are written to.
result_dir = 'tu'
# exist_ok=True creates the directory only when it is missing, without the
# gap between a separate exists() check and the makedirs() call.
makedirs(result_dir, exist_ok=True)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Request headers sent with every fetch.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
}
# Index page address template; '{}' is filled with the page number.
url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
# Exclusive upper bound of the page numbers iterated by the __main__ block.
pages = 3

def scrpae_page(url):
    """Fetch *url* and return the response, or ``None`` on failure.

    On a 200 status the response encoding is set to ``'gbk'`` before the
    response is returned. Any other status code, and any
    ``requests.RequestException`` (timeouts included), is logged and
    results in ``None``.
    """
    logging.info('scraping %s...', url)
    try:
        # Without a timeout a stalled server would block the calling thread
        # forever, and the join() in the main thread would never return.
        response = requests.get(url=url, headers=header, timeout=10)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None
    if response.status_code == 200:
        response.encoding = 'gbk'
        return response
    logging.error('get invalid status code %s while scrape %s', response.status_code, url)
    return None

def scrape_index(page):
    """Build the address of index page *page* and fetch it.

    *page* is filled into the module-level ``url`` template; the return
    value is whatever ``scrpae_page`` returns for that address.
    """
    index_url = url.format(page)
    return scrpae_page(index_url)

# Compiled once at import time. re.S lets '.' match line breaks inside the
# page markup.
_DETAIL_URL_PATTERN = re.compile(r'<script.*?src = "(.*?)"; </script>', re.S)
_TITLE_PATTERN = re.compile(
    r'<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>', re.S
)


def parse_index(html):
    """Extract image titles and detail paths from index page text.

    Args:
        html: Text of an index page.

    Returns:
        A dict with two lists: ``'名称'`` holds the captured ``<span>``
        titles and ``'detail_url'`` holds the captured ``src = "..."``
        values, each in document order.
    """
    return {
        '名称': _TITLE_PATTERN.findall(html),
        'detail_url': _DETAIL_URL_PATTERN.findall(html),
    }

def parse_url(path):
    """Download the resource at *path* and return its raw bytes.

    *path* is resolved against the module-level ``url`` with ``urljoin``
    and fetched through ``scrpae_page``. Returns ``None`` when that fetch
    failed, instead of raising ``AttributeError`` on the missing response.
    """
    detail_url = urljoin(url, path)
    response = scrpae_page(detail_url)
    if response is None:
        return None
    return response.content


# Characters that are invalid in Windows file names, mapped to deletion.
_INVALID_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')


def save(title, conect):
    """Write the bytes *conect* to ``<result_dir>/<title>.jpg``.

    Characters that are invalid in Windows file names are stripped from
    *title* before it is used as the file name.
    """
    safe_title = title.translate(_INVALID_FILENAME_CHARS)
    img_path = result_dir + '/' + safe_title + '.jpg'
    with open(img_path, 'wb') as fp:
        fp.write(conect)


def main(page):
    """Download every image listed on index page *page*.

    A page or image whose fetch failed is skipped; ``scrpae_page`` has
    already logged the failure by then.
    """
    index_html = scrape_index(page)
    if index_html is None:
        return

    # Parse once and reuse the result for both titles and paths.
    parsed = parse_index(index_html.text)
    for title, path in zip(parsed['名称'], parsed['detail_url']):
        conect = parse_url(path)
        if conect is None:
            continue
        save(title, conect)
        logging.info('保存成功')

if __name__ == '__main__':
    # perf_counter() is a monotonic clock, so the elapsed time cannot be
    # skewed by a system clock adjustment during the run.
    start = time.perf_counter()
    tasks = []
    # NOTE(review): range(1, pages) stops before `pages` — confirm that the
    # last page number is meant to be excluded.
    for page in range(1, pages):
        t = threading.Thread(target=main, args=(page,))
        tasks.append(t)
        t.start()

    # Wait for every worker to finish before stopping the clock; without
    # the join() calls the main thread would print the time immediately
    # after starting the threads.
    for t in tasks:
        t.join()
    end = time.perf_counter()
    print('time : {}'.format(end - start))

哈岁NB 发表于 2023-5-30 18:58:56

isdkz 发表于 2023-5-30 18:48
这样写算多线程，时间不是没有打印，而是早就打印了，因为子线程在运行的同时主线程也会继续运行的

你可 ...

好的，感谢感谢

页: [1]

鱼C论坛's Archiver

python线程