[已解决]python线程

哈岁NB · 发表于 2023-5-30 18:43:21

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

请问这样写算多线程吗，还有想问一下这个时间怎么打印不出来

import threading
import multiprocessing
import time
import requests
import logging
import re
from urllib.parse import urljoin
from os.path import exists
from os import makedirs
result_dir = 'tu'
exists(result_dir) or makedirs(result_dir)
logging.basicConfig(level=logging.INFO,format = '%(asctime)s - %(levelname)s: %(message)s')
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
pages = 3
def scrpae_page(url):
logging.info('scraping %s...', url)
try:
response = requests.get(url=url,headers=header)
if response.status_code == 200:
response.encoding = 'gbk'
return response
logging.error('get invalid status code %s while scrape %s',response.status_code,url)
except requests.RequestException:
logging.error('error occurred while scraping %s',url,exc_info=True)
#拼接URl，并爬取主页
def scrape_index(page):
index_url = url.format(page)
return scrpae_page(index_url)
#解析详情页url
def parse_index(html):
#logging.info('{}'.format(html))
url_pattern = re.compile('<script.*?src = "(.*?)"; </script>',re.S)
items = re.findall(url_pattern,html)
title_pattern = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>',re.S)
titles = re.findall(title_pattern,html)
# logging.info('{}'.format(list(titles)))
return {
'名称':titles,
'detail_url':items
}
#拼接url，并调用scrpae_page
def parse_url(path):
detail_url = urljoin(url,path)
return scrpae_page(detail_url).content
#保存
def save(title,conect):
invalid_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|'] # Invalid Windows characters
for char in invalid_chars:
title = title.replace(char, '')
img_path = result_dir + '/' + title + '.jpg'
with open(img_path,'wb') as fp:
fp.write(conect)
def main(page):
index_html = scrape_index(page)
titles = parse_index(index_html.text)['名称']
details_url = parse_index(index_html.text)['detail_url']
for title,url in zip(titles,details_url):
conect = parse_url(url)
save(title,conect)
logging.info('保存成功')
if __name__ == '__main__':
start = time.time()
for page in range(1,pages):
t = threading.Thread(target=main,args=(page,))
t.start()
end = time.time()
print('time : {}'.format(end-start))

复制代码

最佳答案

月排行榜 / 总排行榜

isdkz

2023-5-30 18:48:18

本帖最后由 isdkz 于 2023-5-30 18:50 编辑

这样写算多线程，时间不是没有打印，而是早就打印了，因为子线程在运行的同时主线程也会继续运行的

你可以使用 join 让主线程等待子线程运行结束

对你的代码修改如下（新增的内容在 86、92、93行）：

import threading

import multiprocessing

import time

import requests

import logging

import re

from urllib.parse import urljoin

from os.path import exists

from os import makedirs

result_dir = 'tu'

exists(result_dir) or makedirs(result_dir)

logging.basicConfig(level=logging.INFO,format = '%(asctime)s - %(levelname)s: %(message)s')

header = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"

}

url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'

pages = 3

def scrpae_page(url):

logging.info('scraping %s...', url)

try:

      response = requests.get(url=url,headers=header)

      if response.status_code == 200:

         response.encoding = 'gbk'

         return response

      logging.error('get invalid status code %s while scrape %s',response.status_code,url)

except requests.RequestException:

      logging.error('error occurred while scraping %s',url,exc_info=True)

#拼接URl，并爬取主页

def scrape_index(page):

index_url = url.format(page)

return scrpae_page(index_url)

#解析详情页url

def parse_index(html):

#logging.info('{}'.format(html))

url_pattern = re.compile('<script.*?src = "(.*?)"; </script>',re.S)

items = re.findall(url_pattern,html)

title_pattern = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>',re.S)

titles = re.findall(title_pattern,html)

# logging.info('{}'.format(list(titles)))

return {

      '名称':titles,

      'detail_url':items

}

#拼接url，并调用scrpae_page

def parse_url(path):

detail_url = urljoin(url,path)

return scrpae_page(detail_url).content

#保存

def save(title,conect):

invalid_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']  # Invalid Windows characters

for char in invalid_chars:

      title = title.replace(char, '')

img_path = result_dir + '/' + title + '.jpg'

with open(img_path,'wb') as fp:

      fp.write(conect)

def main(page):

index_html = scrape_index(page)

titles = parse_index(index_html.text)['名称']

details_url = parse_index(index_html.text)['detail_url']

for title,url in zip(titles,details_url):

      conect = parse_url(url)

      save(title,conect)

      logging.info('保存成功')

if __name__ == '__main__':

start = time.time()

tasks = []

for page in range(1,pages):

      t = threading.Thread(target=main,args=(page,))

      tasks.append(t)

      t.start()

for t in tasks:

      t.join()

end = time.time()

print('time : {}'.format(end-start))

复制代码

跳转到最佳答案楼层

isdkz · 发表于 2023-5-30 18:48:18

本帖最后由 isdkz 于 2023-5-30 18:50 编辑

这样写算多线程，时间不是没有打印，而是早就打印了，因为子线程在运行的同时主线程也会继续运行的

你可以使用 join 让主线程等待子线程运行结束

对你的代码修改如下（新增的内容在 86、92、93行）：

import threading

import multiprocessing

import time

import requests

import logging

import re

from urllib.parse import urljoin

from os.path import exists

from os import makedirs

result_dir = 'tu'

exists(result_dir) or makedirs(result_dir)

logging.basicConfig(level=logging.INFO,format = '%(asctime)s - %(levelname)s: %(message)s')

header = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"

}

url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'

pages = 3

def scrpae_page(url):

logging.info('scraping %s...', url)

try:

      response = requests.get(url=url,headers=header)

      if response.status_code == 200:

         response.encoding = 'gbk'

         return response

      logging.error('get invalid status code %s while scrape %s',response.status_code,url)

except requests.RequestException:

      logging.error('error occurred while scraping %s',url,exc_info=True)

#拼接URl，并爬取主页

def scrape_index(page):

index_url = url.format(page)

return scrpae_page(index_url)

#解析详情页url

def parse_index(html):

#logging.info('{}'.format(html))

url_pattern = re.compile('<script.*?src = "(.*?)"; </script>',re.S)

items = re.findall(url_pattern,html)

title_pattern = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>',re.S)

titles = re.findall(title_pattern,html)

# logging.info('{}'.format(list(titles)))

return {

      '名称':titles,

      'detail_url':items

}

#拼接url，并调用scrpae_page

def parse_url(path):

detail_url = urljoin(url,path)

return scrpae_page(detail_url).content

#保存

def save(title,conect):

invalid_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']  # Invalid Windows characters

for char in invalid_chars:

      title = title.replace(char, '')

img_path = result_dir + '/' + title + '.jpg'

with open(img_path,'wb') as fp:

      fp.write(conect)

def main(page):

index_html = scrape_index(page)

titles = parse_index(index_html.text)['名称']

details_url = parse_index(index_html.text)['detail_url']

for title,url in zip(titles,details_url):

      conect = parse_url(url)

      save(title,conect)

      logging.info('保存成功')

if __name__ == '__main__':

start = time.time()

tasks = []

for page in range(1,pages):

      t = threading.Thread(target=main,args=(page,))

      tasks.append(t)

      t.start()

for t in tasks:

      t.join()

end = time.time()

print('time : {}'.format(end-start))

复制代码

哈岁NB · 发表于 2023-5-30 18:58:56

isdkz 发表于 2023-5-30 18:48
这样写算多线程，时间不是没有打印，而是早就打印了，因为子线程在运行的同时主线程也会继续运行的

你可 ...

好的，感谢感谢

账号		自动登录	找回密码
密码			立即注册

[已解决]python线程

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块