鱼C论坛

 找回密码
 立即注册
查看: 1017|回复: 2

[已解决]python线程

[复制链接]
发表于 2023-5-30 18:43:21 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
请问这样写算多线程吗?另外想问一下,最后的耗时(time)为什么好像打印不出来?
import threading
import multiprocessing
import time

import requests
import logging
import re
from urllib.parse import urljoin
from os.path import exists
from os import makedirs

# Directory that downloaded images are written into.
result_dir = 'tu'
# exist_ok=True replaces the check-then-create `exists(...) or makedirs(...)`,
# which can fail if the directory appears between the check and the call.
makedirs(result_dir, exist_ok=True)


# Log at INFO level with a timestamp and level prefix on every line.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Headers sent with every request (a desktop Chrome User-Agent string).
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
# Listing-page URL template; `{}` is filled in with the page number.
url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
# Exclusive upper bound of the page loop in the script entry point.
pages = 3


def scrpae_page(url, timeout=10):
    """Fetch *url* and return the response, or ``None`` on failure.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            stall the thread that called this function.

    Returns:
        The ``requests.Response`` (with ``encoding`` set to ``'gbk'``) when
        the status code is 200; ``None`` for any other status code or when
        the request raises ``requests.RequestException``.
    """
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url=url, headers=header, timeout=timeout)
    except requests.RequestException:
        # Timeouts are RequestException subclasses, so they are logged here too.
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None
    if response.status_code == 200:
        response.encoding = 'gbk'
        return response
    logging.error('get invalid status code %s while scrape %s', response.status_code, url)
    return None

# Build the listing-page URL and fetch it.
def scrape_index(page):
    """Return what ``scrpae_page`` yields for listing page number *page*.

    The page number is substituted into the module-level ``url`` template.
    """
    return scrpae_page(url.format(page))

# Pull image titles and image paths out of a listing page.
def parse_index(html):
    """Extract titles and image paths from listing-page markup.

    Args:
        html: Text of a listing page.

    Returns:
        A dict with two lists: ``'名称'`` holds the ``<span>`` text of each
        ``<LI>`` entry, ``'detail_url'`` holds the value assigned to ``src``
        inside each ``<script>`` element.
    """
    path_regex = re.compile('<script.*?src = "(.*?)"; </script>', re.S)
    name_regex = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>', re.S)

    found = {}
    found['名称'] = name_regex.findall(html)
    found['detail_url'] = path_regex.findall(html)
    return found

# Resolve the image path against the listing URL and download it.
def parse_url(path):
    """Download the resource at *path* and return its raw bytes.

    *path* is joined onto the module-level ``url`` with ``urljoin`` and the
    result is fetched through ``scrpae_page``.

    NOTE(review): ``scrpae_page`` returns ``None`` when the request fails or
    the status is not 200; in that case the ``.content`` access below raises
    ``AttributeError``.
    """
    absolute = urljoin(url, path)
    response = scrpae_page(absolute)
    return response.content

# Write one downloaded image to disk.
def save(title,conect):
    """Write the bytes *conect* to ``<result_dir>/<title>.jpg``.

    Characters that are invalid in Windows file names are removed from
    *title* before the path is built.
    """
    # A deletion table: translate() drops every character listed here.
    strip_table = str.maketrans('', '', '\\/:*?"<>|')
    file_name = title.translate(strip_table) + '.jpg'
    with open(result_dir + '/' + file_name, 'wb') as out:
        out.write(conect)



def main(page):
    """Download and save every image listed on listing page number *page*.

    Fetches the listing page, extracts the (title, image path) pairs and, for
    each pair, downloads the image and writes it to disk via ``save``.

    Args:
        page: Page number substituted into the listing URL template.
    """
    index_html = scrape_index(page)
    if index_html is None:
        # The download failed and scrpae_page has already logged why; there
        # is nothing to parse, so stop instead of crashing on `.text`.
        return

    # Parse once and reuse the result for both lists.
    parsed = parse_index(index_html.text)
    # `path` (not `url`) so the module-level URL template is not shadowed.
    for title, path in zip(parsed['名称'], parsed['detail_url']):
        conect = parse_url(path)
        save(title, conect)
        logging.info('保存成功')
    



# Script entry point: start one thread per listing page and print the elapsed time.
if __name__ == '__main__':
    start = time.time()
    # range(1, pages) yields page numbers 1 .. pages-1.
    for page in range(1,pages):
        t = threading.Thread(target=main,args=(page,))
        t.start()
    # NOTE: the threads are never joined, so `end` is taken right after the
    # last start() call. The time printed below therefore covers only thread
    # start-up and appears before the downloads have finished.
    end = time.time()
    print('time : {}'.format(end-start))
最佳答案
2023-5-30 18:48:18
本帖最后由 isdkz 于 2023-5-30 18:50 编辑

这样写算多线程,时间不是没有打印,而是早就打印了,因为子线程在运行的同时主线程也会继续运行的

你可以使用 join 让主线程等待子线程运行结束

对你的代码修改如下(新增的内容在 86、89、92、93行,即创建 tasks 列表、把线程加入列表,以及最后的 join 循环):
import threading
import multiprocessing
import time

import requests
import logging
import re
from urllib.parse import urljoin
from os.path import exists
from os import makedirs

# Directory that downloaded images are written into.
result_dir = 'tu'
# exist_ok=True replaces the check-then-create `exists(...) or makedirs(...)`,
# which can fail if the directory appears between the check and the call.
makedirs(result_dir, exist_ok=True)


# Log at INFO level with a timestamp and level prefix on every line.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Headers sent with every request (a desktop Chrome User-Agent string).
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
# Listing-page URL template; `{}` is filled in with the page number.
url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
# Exclusive upper bound of the page loop in the script entry point.
pages = 3


def scrpae_page(url, timeout=10):
    """Fetch *url* and return the response, or ``None`` on failure.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            stall the thread that called this function.

    Returns:
        The ``requests.Response`` (with ``encoding`` set to ``'gbk'``) when
        the status code is 200; ``None`` for any other status code or when
        the request raises ``requests.RequestException``.
    """
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url=url, headers=header, timeout=timeout)
    except requests.RequestException:
        # Timeouts are RequestException subclasses, so they are logged here too.
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None
    if response.status_code == 200:
        response.encoding = 'gbk'
        return response
    logging.error('get invalid status code %s while scrape %s', response.status_code, url)
    return None

# Build the listing-page URL and fetch it.
def scrape_index(page):
    """Return what ``scrpae_page`` yields for listing page number *page*.

    The page number is substituted into the module-level ``url`` template.
    """
    return scrpae_page(url.format(page))

# Pull image titles and image paths out of a listing page.
def parse_index(html):
    """Extract titles and image paths from listing-page markup.

    Args:
        html: Text of a listing page.

    Returns:
        A dict with two lists: ``'名称'`` holds the ``<span>`` text of each
        ``<LI>`` entry, ``'detail_url'`` holds the value assigned to ``src``
        inside each ``<script>`` element.
    """
    path_regex = re.compile('<script.*?src = "(.*?)"; </script>', re.S)
    name_regex = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>', re.S)

    found = {}
    found['名称'] = name_regex.findall(html)
    found['detail_url'] = path_regex.findall(html)
    return found

# Resolve the image path against the listing URL and download it.
def parse_url(path):
    """Download the resource at *path* and return its raw bytes.

    *path* is joined onto the module-level ``url`` with ``urljoin`` and the
    result is fetched through ``scrpae_page``.

    NOTE(review): ``scrpae_page`` returns ``None`` when the request fails or
    the status is not 200; in that case the ``.content`` access below raises
    ``AttributeError``.
    """
    absolute = urljoin(url, path)
    response = scrpae_page(absolute)
    return response.content

# Write one downloaded image to disk.
def save(title,conect):
    """Write the bytes *conect* to ``<result_dir>/<title>.jpg``.

    Characters that are invalid in Windows file names are removed from
    *title* before the path is built.
    """
    # A deletion table: translate() drops every character listed here.
    strip_table = str.maketrans('', '', '\\/:*?"<>|')
    file_name = title.translate(strip_table) + '.jpg'
    with open(result_dir + '/' + file_name, 'wb') as out:
        out.write(conect)



def main(page):
    """Download and save every image listed on listing page number *page*.

    Fetches the listing page, extracts the (title, image path) pairs and, for
    each pair, downloads the image and writes it to disk via ``save``.

    Args:
        page: Page number substituted into the listing URL template.
    """
    index_html = scrape_index(page)
    if index_html is None:
        # The download failed and scrpae_page has already logged why; there
        # is nothing to parse, so stop instead of crashing on `.text`.
        return

    # Parse once and reuse the result for both lists.
    parsed = parse_index(index_html.text)
    # `path` (not `url`) so the module-level URL template is not shadowed.
    for title, path in zip(parsed['名称'], parsed['detail_url']):
        conect = parse_url(path)
        save(title, conect)
        logging.info('保存成功')
   



# Script entry point: run one thread per listing page and time the whole run.
if __name__ == '__main__':
    start = time.time()
    tasks = []
    # range(1, pages) yields page numbers 1 .. pages-1.
    for page in range(1, pages):
        tasks.append(threading.Thread(target=main, args=(page,)))
        tasks[-1].start()

    # Wait for every worker so the elapsed time covers the downloads and not
    # just thread start-up.
    for worker in tasks:
        worker.join()
    print('time : {}'.format(time.time() - start))

想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

发表于 2023-5-30 18:48:18 | 显示全部楼层    本楼为最佳答案   
本帖最后由 isdkz 于 2023-5-30 18:50 编辑

这样写算多线程,时间不是没有打印,而是早就打印了,因为子线程在运行的同时主线程也会继续运行的

你可以使用 join 让主线程等待子线程运行结束

对你的代码修改如下(新增的内容在 86、89、92、93行,即创建 tasks 列表、把线程加入列表,以及最后的 join 循环):
import threading
import multiprocessing
import time

import requests
import logging
import re
from urllib.parse import urljoin
from os.path import exists
from os import makedirs

# Directory that downloaded images are written into.
result_dir = 'tu'
# exist_ok=True replaces the check-then-create `exists(...) or makedirs(...)`,
# which can fail if the directory appears between the check and the call.
makedirs(result_dir, exist_ok=True)


# Log at INFO level with a timestamp and level prefix on every line.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Headers sent with every request (a desktop Chrome User-Agent string).
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
# Listing-page URL template; `{}` is filled in with the page number.
url = 'http://md.itlun.cn/a/nhtp/list_2_{}.html'
# Exclusive upper bound of the page loop in the script entry point.
pages = 3


def scrpae_page(url, timeout=10):
    """Fetch *url* and return the response, or ``None`` on failure.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            stall the thread that called this function.

    Returns:
        The ``requests.Response`` (with ``encoding`` set to ``'gbk'``) when
        the status code is 200; ``None`` for any other status code or when
        the request raises ``requests.RequestException``.
    """
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url=url, headers=header, timeout=timeout)
    except requests.RequestException:
        # Timeouts are RequestException subclasses, so they are logged here too.
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None
    if response.status_code == 200:
        response.encoding = 'gbk'
        return response
    logging.error('get invalid status code %s while scrape %s', response.status_code, url)
    return None

# Build the listing-page URL and fetch it.
def scrape_index(page):
    """Return what ``scrpae_page`` yields for listing page number *page*.

    The page number is substituted into the module-level ``url`` template.
    """
    return scrpae_page(url.format(page))

# Pull image titles and image paths out of a listing page.
def parse_index(html):
    """Extract titles and image paths from listing-page markup.

    Args:
        html: Text of a listing page.

    Returns:
        A dict with two lists: ``'名称'`` holds the ``<span>`` text of each
        ``<LI>`` entry, ``'detail_url'`` holds the value assigned to ``src``
        inside each ``<script>`` element.
    """
    path_regex = re.compile('<script.*?src = "(.*?)"; </script>', re.S)
    name_regex = re.compile('<LI>.*?><IMG id="img.*?><span>(.*?)</span></a></LI>', re.S)

    found = {}
    found['名称'] = name_regex.findall(html)
    found['detail_url'] = path_regex.findall(html)
    return found

# Resolve the image path against the listing URL and download it.
def parse_url(path):
    """Download the resource at *path* and return its raw bytes.

    *path* is joined onto the module-level ``url`` with ``urljoin`` and the
    result is fetched through ``scrpae_page``.

    NOTE(review): ``scrpae_page`` returns ``None`` when the request fails or
    the status is not 200; in that case the ``.content`` access below raises
    ``AttributeError``.
    """
    absolute = urljoin(url, path)
    response = scrpae_page(absolute)
    return response.content

# Write one downloaded image to disk.
def save(title,conect):
    """Write the bytes *conect* to ``<result_dir>/<title>.jpg``.

    Characters that are invalid in Windows file names are removed from
    *title* before the path is built.
    """
    # A deletion table: translate() drops every character listed here.
    strip_table = str.maketrans('', '', '\\/:*?"<>|')
    file_name = title.translate(strip_table) + '.jpg'
    with open(result_dir + '/' + file_name, 'wb') as out:
        out.write(conect)



def main(page):
    """Download and save every image listed on listing page number *page*.

    Fetches the listing page, extracts the (title, image path) pairs and, for
    each pair, downloads the image and writes it to disk via ``save``.

    Args:
        page: Page number substituted into the listing URL template.
    """
    index_html = scrape_index(page)
    if index_html is None:
        # The download failed and scrpae_page has already logged why; there
        # is nothing to parse, so stop instead of crashing on `.text`.
        return

    # Parse once and reuse the result for both lists.
    parsed = parse_index(index_html.text)
    # `path` (not `url`) so the module-level URL template is not shadowed.
    for title, path in zip(parsed['名称'], parsed['detail_url']):
        conect = parse_url(path)
        save(title, conect)
        logging.info('保存成功')
   



# Script entry point: run one thread per listing page and time the whole run.
if __name__ == '__main__':
    start = time.time()
    tasks = []
    # range(1, pages) yields page numbers 1 .. pages-1.
    for page in range(1, pages):
        tasks.append(threading.Thread(target=main, args=(page,)))
        tasks[-1].start()

    # Wait for every worker so the elapsed time covers the downloads and not
    # just thread start-up.
    for worker in tasks:
        worker.join()
    print('time : {}'.format(time.time() - start))

想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复 支持 反对

使用道具 举报

 楼主| 发表于 2023-5-30 18:58:56 | 显示全部楼层
isdkz 发表于 2023-5-30 18:48
这样写算多线程,时间不是没有打印,而是早就打印了,因为子线程在运行的同时主线程也会继续运行的

你可 ...

好的,感谢感谢
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-12-27 11:30

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表