import urllib.parse
import re
import urllib.request
from bs4 import BeautifulSoup
def main():
    baseurl = 'https://movie.douban.com/top250?start='
    get(baseurl)

# regex for the detail-page link and the poster image URL
# (the image pattern is generalized; the original only matched one hard-coded movie title)
findlink = re.compile(r'<a href="(.*?)">')
findimg = re.compile(r'<img.*?src="(.*?)"', re.S)

def get(baseurl):
    datalist = []
    for i in range(0, 2):                                    # first two pages, 25 movies each
        url = baseurl + str(i * 25)
        html = geturl(url)                                   # fetch the page source
        soup = BeautifulSoup(html, 'html.parser')            # parse it
        for item in soup.find_all('div', class_="item"):     # every movie entry on the page
            data = []                                        # holds all info for one movie
            item = str(item)
            link = re.findall(findlink, item)[0]             # detail-page link
            data.append(link)
            img = re.findall(findimg, item)[0]               # poster image URL
            data.append(img)
            datalist.append(data)
    print(datalist)

def geturl(url):
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.63'}
    req = urllib.request.Request(url, headers=head)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')                   # page source as text
    return html

if __name__ == '__main__':
    main()
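
For comparison, the same two fields can also be read straight from the parsed tags, without regular expressions. This is only a minimal sketch assuming the same page layout as above; the get_data name is mine, not from the original script.

import urllib.request
from bs4 import BeautifulSoup

def get_data(url):
    # get_data is an illustrative helper, not part of the original script
    head = {'User-Agent': 'Mozilla/5.0'}       # any common browser UA string
    req = urllib.request.Request(url, headers=head)
    html = urllib.request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    datalist = []
    for item in soup.find_all('div', class_='item'):
        link = item.find('a')['href']          # href attribute of the first <a> in the item
        img = item.find('img')['src']          # src attribute of the poster <img>
        datalist.append([link, img])
    return datalist

print(get_data('https://movie.douban.com/top250?start=0'))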