How do I extract all the images from this page?
My code:

import requests
from bs4 import BeautifulSoup
from lxml import etree
import csv

def download_one_page(url):
    # Get the page source
    resp = requests.get(url)
    resp.encoding = 'utf-8'  # handle garbled characters
    # Scrape the images: hand the page source to BeautifulSoup
    pictures = BeautifulSoup(resp.text, "html.parser")
    alist = pictures.find_all("img", width="205")  # first pass at narrowing the scope
    print(len(alist))
    for a in alist:
        # Get the image address
        img_href = a.get('href')
        # print(img_href)
        # Download the image
        img_resp = requests.get(img_src)
        path = "D://Picture//"
        img_name = img_src.split("/")[-1]
        with open(path + img_name, mode='wb') as P:
            P.write(img_resp.content)
    print("over")

if __name__ == '__main__':
    download_one_page('http://www.c-denkei.cn/index.php?d=home&c=goods&m=search&s=%E7%94%B5%E6%BA%90&c1=0&c2=0&c3=0&page=')
With find I can only extract the first image, but if I use find_all directly it throws an error. So frustrating!
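For reference, find returns only the first matching tag, while find_all returns a list of tags; the list itself has no .get(), so you have to loop over it before reading attributes. A minimal sketch with toy markup (not this page's actual HTML):

from bs4 import BeautifulSoup

html = '<img width="205" src="/a.jpg"><img width="205" src="/b.jpg">'  # toy markup
soup = BeautifulSoup(html, "html.parser")

first = soup.find("img", width="205")         # a single Tag: only the first image
print(first.get("src"))                       # /a.jpg

all_imgs = soup.find_all("img", width="205")  # a ResultSet (a list) of Tags
for img in all_imgs:                          # iterate before calling .get()
    print(img.get("src"))                     # /a.jpg then /b.jpg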
I modified your code as follows:

import requests
from bs4 import BeautifulSoup
from lxml import etree
import csv
import os

def download_one_page(url):
    # Get the page source
    resp = requests.get(url)
    resp.encoding = 'utf-8'  # handle garbled characters
    # Scrape the images: hand the page source to BeautifulSoup
    pictures = BeautifulSoup(resp.text, "html.parser")
    alist = pictures.find_all("img", width="205")  # first pass at narrowing the scope
    print(len(alist))
    for a in alist:
        # Get the image address
        img_src = a.get('src')  # changed this line: an img tag keeps its address in the src attribute
        # print(img_src)
        # Download the image
        img_resp = requests.get(img_src)
        path = "D:/Picture/"  # a single slash works too; the doubled slashes are unnecessary
        if not os.path.exists(path):  # create the folder if it does not exist yet
            os.mkdir(path)
        img_name = img_src.split("/")[-1]
        with open(path + img_name, mode='wb') as P:
            P.write(img_resp.content)
    print("over")

if __name__ == '__main__':
    download_one_page('http://www.c-denkei.cn/index.php?d=home&c=goods&m=search&s=%E7%94%B5%E6%BA%90&c1=0&c2=0&c3=0&page=')

Post #4 is correct.
How far into the course do I have to get before I can do this!
You can also use XPath to extract the elements you need from a page; it's arguably easier to work with.
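For anyone who wants to try the XPath route mentioned above, here is a minimal sketch using lxml, which the code already imports. The //img[@width="205"]/@src expression is my guess at this page's markup, not something verified against it, and urljoin is added on the assumption that the src values may be relative:

import os
from urllib.parse import urljoin

import requests
from lxml import etree

def download_one_page(url):
    resp = requests.get(url)
    resp.encoding = 'utf-8'  # handle garbled characters
    tree = etree.HTML(resp.text)
    # Grab the src attribute of every img whose width is 205, in one expression
    srcs = tree.xpath('//img[@width="205"]/@src')  # assumed filter, mirroring the code above
    print(len(srcs))
    path = "D:/Picture/"
    if not os.path.exists(path):
        os.mkdir(path)
    for src in srcs:
        img_url = urljoin(url, src)  # resolve a relative src against the page URL
        img_resp = requests.get(img_url)
        img_name = img_url.split("/")[-1]
        with open(path + img_name, mode='wb') as f:
            f.write(img_resp.content)
    print("over")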