|
发表于 2024-4-23 04:39:03
|
显示全部楼层
帮你整合了一下代码:
import csv
import re
import time

import requests
from bs4 import BeautifulSoup
def fetchData(id):
    """Scrape one player page from sofifa.com and return a flat record.

    Parameters
    ----------
    id : int
        sofifa player id (appended to the /player/ URL).

    Returns
    -------
    list | None
        [id, name, birthday, height, weight, preferred_foot,
         skill_move_level, reputation, wr_att, wr_def, positions,
         best_position, club, nation, *attribute_ratings],
        or None when the page is missing or is not a FIFA 23 player page.

    Side effect: saves the player's portrait to ./{id}_{name}.png.
    """
    url = f'https://sofifa.com/player/{id}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'Referer': 'https://www.google.com/'
    }
    # BUG FIX: the headers dict was built but never sent; without a
    # browser-like User-Agent sofifa is likely to throttle or block us.
    myRequest = requests.get(url, headers=headers)
    soup = BeautifulSoup(myRequest.text, 'lxml')

    meta = soup.find(attrs={'name': 'description'})
    if meta is None:
        return None
    meta = meta['content']

    years = soup.find(name='span', attrs={'class': 'bp3-button-text'})
    # Robustness: the version-selector span may be absent on error pages;
    # the original crashed on `years.string` in that case.
    if years is None:
        return None
    # Original filter preserved verbatim: only FIFA 23 player pages pass
    # (a description starting with 'FIFA' appears to be a version landing
    # page rather than a player page — TODO confirm against the site).
    if meta[:4] != 'FIFA' and str(years.string) != "FIFA 23" or meta[:4] == 'FIFA':
        return None

    info = soup.find(name='div', attrs={'class': 'info'})
    playerName = info.h1.string
    myList = [id, playerName]

    # Birth date / height / weight live together in one text blob.
    rawdata = soup.select("#body > div:nth-child(5) > div > div.col.col-12 > div.bp3-card.player > div > div")
    offset = len(rawdata[0].find_all("span")) - 1
    temp = re.split(r'\s+', rawdata[0].text)
    # BUG FIX: the original `for i in range(offset): temp.pop(i)` popped at
    # a moving index, removing every *other* token; the intent is to drop
    # the first `offset` position tokens.
    if offset > 0:
        del temp[:offset]

    month = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    mon = month.index(temp[3][1:]) + 1   # "(Jan" -> 1..12
    day = temp[4][:-1]                   # "5,"   -> "5"
    year = temp[5][:-1]                  # "1990)" -> "1990"
    # Cleanup: the original built this via eval(str(list)[1:-1]); a plain
    # f-string yields the identical 'YYYY/M/D' string.
    myList.append(f"{year}/{mon}/{day}")

    height = int(temp[6][:-2])           # strip trailing "cm"
    myList.append(height)
    weight = int(temp[7][:-2])           # strip trailing "kg"
    myList.append(weight)

    # Profile list: preferred foot, skill moves, reputation, work rates.
    rawdata = soup.select("#body > div:nth-child(5) > div > div.col.col-12 > div:nth-child(2) > div > ul")
    temp = rawdata[0].find_all('li', class_="ellipsis")
    preferred_foot = temp[0].contents[1]
    preferred_foot = 1 if (preferred_foot == 'Left') else 2
    myList.append(preferred_foot)

    skill_move_level = temp[2].contents[0]
    myList.append(int(skill_move_level))

    reputation = temp[3].contents[0]
    myList.append(int(reputation))

    todostr = temp[4].text
    workrateString = re.split(r'\s+', todostr)
    wr_att = workrateString[1][4:-1]     # slice strips fixed prefix/suffix — TODO confirm page text shape
    wr_def = workrateString[2]
    wrList = ['Low', "Medium", "High"]
    wr_att = wrList.index(wr_att) + 1
    wr_def = wrList.index(wr_def) + 1
    myList.append(wr_att)
    myList.append(wr_def)

    # Download the portrait next to the script.
    rawdata = soup.select("#body > div:nth-child(5) > div > div.col.col-12 > div.bp3-card.player > img")
    img_url = rawdata[0].get("data-src")
    img_r = requests.get(img_url, stream=True)
    img_name = f"{id}_{playerName}.png"
    with open(f"./{img_name}", "wb") as fi:
        # Larger chunks than the original 120 bytes; written bytes identical.
        for chunk in img_r.iter_content(chunk_size=8192):
            fi.write(chunk)

    # All listed positions, space-separated (trailing space kept as before).
    rawdata = soup.select("#body > div:nth-child(5) > div > div.col.col-12 > div.bp3-card.player > div > div > span")
    allPos = ''.join(f"{p.text} " for p in rawdata)
    myList.append(allPos)

    rawdata = soup.select("#body > div:nth-child(6) > div > div.col.col-4 > ul > li:nth-child(1) > span")
    bestPos = rawdata[0].text
    myList.append(bestPos)

    rawdata = soup.select("#body > div:nth-child(5) > div > div.col.col-12 > div:nth-child(4) > div > h5> a")
    club = rawdata[0].text if len(rawdata) > 0 else "没有俱乐部"
    myList.append(club)

    rawdata = soup.select("#body > div:nth-child(5) > div > div.col.col-12 > div.bp3-card.player > div > div > a")
    nation = rawdata[0].get("title") if len(rawdata) > 0 else "其他国家"
    myList.append(nation)

    # Numeric attribute ratings: every tag whose class matches 'bp3-tag p'.
    rawdata = soup.select('#body>div:nth-child(6)>div>div.col.col-12')
    data = rawdata[0].find_all(class_=re.compile('bp3-tag p'))
    myList.extend(allatt.text for allatt in data)

    return myList
def dealWithData(dataToWrite):
    """Append player rows to ./output.csv, writing the header only once.

    Parameters
    ----------
    dataToWrite : list[list]
        Rows as produced by fetchData().
    """
    header_list = ['id','name','birthday','height','weight','preferred_foot',"skill_move_level","reputation","wr_att","wr_def",'Positions','Best Position','Club',"nation",'Crossing','Finishing','Heading Accuracy', 'Short Passing','Volleys','Dribbling','Curve', 'FK Accuracy','Long Passing','Ball Control','Acceleration','Sprint Speed','Agility','Reactions','Balance','Shot Power','Jumping','Stamina','Strength','Long Shots','Aggression','Interceptions','Positioning','Vision','Penalties','Composure','Defensive Awareness','Standing Tackle','Sliding Tackle','GK Diving','GK Handling','GK Kicking','GK Positioning','GK Reflexes']
    with open('./output.csv', 'a+', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        # BUG FIX: the header was re-written on every call, so each batch
        # appended a duplicate header row. In append mode the stream is
        # positioned at end-of-file, so tell() == 0 means the file is empty.
        if f.tell() == 0:
            writer.writerow(header_list)
        writer.writerows(dataToWrite)
def getPlayerID(key):
    """Search sofifa.com by keyword and return candidate players.

    Parameters
    ----------
    key : str
        Search keyword (player name fragment).

    Returns
    -------
    list[list]
        [player_id, player_name, overall_rating] per match; a single
        placeholder row when nothing is found.
    """
    url = f"https://sofifa.com/players?keyword={key}"
    myRequest = requests.get(url)
    soup = BeautifulSoup(myRequest.text, 'lxml')
    playerTable = soup.select("#body>div.center>div>div.col.col-12>div>table>tbody")
    candidates = []

    # BUG FIX: the original iterated tbody.contents, which also yields
    # whitespace text nodes (subscripting their .find() result crashes);
    # iterate the actual <tr> rows, and guard the empty-selector case
    # instead of raising IndexError.
    rows = playerTable[0].find_all("tr") if playerTable else []
    if rows:
        for row in rows:
            pid = row.find("img")["id"]
            name = row.find("a")["aria-label"]
            ovr = row.find(attrs={"data-col": "oa"}).get_text()
            candidates.append([pid, name, ovr])
    else:
        print("not found")
        candidates.append(["not found", "the name you're searching is >>", key])

    return candidates
- if __name__ == "__main__":
- # 通过递增ID搜索
- for start in range(20000, 40000, 1000): # 每次爬取1000个球员
- soData = []
- for s in range(start, start + 1000):
- l = fetchData(s)
- if l != None:
- soData.append(l)
- dealWithData(soData)
- time.sleep(60) # 爬取完一批次后,休眠60秒
复制代码
|
|