|
发表于 2023-3-5 14:37:52
|
显示全部楼层
本帖最后由 isdkz 于 2023-3-5 14:49 编辑
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 15 15:02:26 2023
@author: Neal

Shareholder information of a stock is listed at:
https://q.stock.sohu.com/cn/000001/ltgd.shtml
https://q.stock.sohu.com/cn/000002/ltgd.shtml
https://q.stock.sohu.com/cn/000003/ltgd.shtml
...
You are required to collect the tables of shareholder information for the
stocks in "select_stocks" with the following 7 columns, and then perform the
analysis to answer the questions.
1. 'stock'      - stock code
2. 'rank'       - rank
3. 'org_name'   - shareholder name
4. 'shares'     - number of shares held (unit: 10,000 shares)
5. 'percentage' - shareholding percentage
6. 'changes'    - change in holdings (unit: 10,000 shares)
7. 'nature'     - nature of the shares
"""
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

fake_header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2",
}

# NOTE: nothing in this script writes to data_file; the name is kept because
# the original script defines it and follow-on code may rely on it.
data_file = './data/stock_shareholders.csv'
# Path the crawled records are actually written to.
excel_file = "./data/sharehold_records.xlsx"

select_stocks = ('601186', '601169', '601166', '601088', '601006', '600523',
                 '600999', '601988', '600919', '600887', '600837', '600606', '600547',
                 '600519', '600518', '600485', '600340', '601881', '600104', '600100')
print('There are', len(select_stocks), 'stocks in select_stocks')

base_url = 'https://q.stock.sohu.com/cn/{}/ltgd.shtml'

# CSS path of the shareholder table inside the page.
table_selector = 'body > div.str2Column.clearfix > div.str2ColumnR > div.BIZ_innerMain > div.BIZ_innerBoard > div > div:nth-child(2) > table'

# Seconds to wait for the server before giving up on one request; without a
# timeout requests.get() may block forever on a stalled connection.
request_timeout = 10

# A valid record is the stock code followed by the six table cells.
record_length = 7


def crawl_stock(stock: str) -> list:
    """Download one stock's shareholder page and return its table records.

    Args:
        stock: stock code substituted into ``base_url``.

    Returns:
        A list of records, each a list of ``record_length`` strings: the stock
        code followed by the stripped text of the row's ``td`` cells. Rows
        with a different number of cells are skipped. An empty list is
        returned (after printing the reason) when the request fails, the
        status code is not 200, or the table cannot be located.
    """
    url = base_url.format(stock)
    try:
        # send http request with fake http header
        response = requests.get(url, headers=fake_header, timeout=request_timeout)
    except requests.RequestException as err:
        print('Request for stock', stock, 'failed:', err)
        return []

    if response.status_code != 200:
        print('Skipping stock', stock, '- unexpected HTTP status', response.status_code)
        return []

    # The script decodes the page as GBK rather than relying on the
    # encoding guessed by requests.
    response.encoding = 'gbk'
    root = BeautifulSoup(response.text, "html.parser")

    # search the table storing the shareholder information
    table = root.select_one(table_selector)
    if table is None:
        # select_one() yields None when nothing matches; calling .select() on
        # it would raise AttributeError, so report and skip this stock.
        print('Skipping stock', stock, '- shareholder table not found in', url)
        return []

    records = []
    for row in table.select('tr'):  # iterate rows (tr tags)
        # a record starts with the stock code, followed by the row's td cells
        record = [stock]
        record.extend(col.get_text().strip() for col in row.select('td'))
        if len(record) == record_length:  # keep only rows with all columns
            records.append(record)
    return records


row_count = 0
# create a list to store the crawled share-holding records
results = []
for stock in select_stocks:  # process stocks one by one
    print("Now we are crawling stock", stock)
    stock_records = crawl_stock(stock)
    results.extend(stock_records)
    row_count += len(stock_records)
    time.sleep(1)  # pause between requests to avoid hammering the server

# Make sure the output folder exists so that saving does not fail.
os.makedirs(os.path.dirname(excel_file), exist_ok=True)

sharehold_records_df = pd.DataFrame(
    columns=['stock', 'rank', 'org_name', 'shares', 'percentage', 'changes', 'nature'],
    data=results,
)
sharehold_records_df.to_excel(excel_file)
# Report only after the file has really been written, and name the real file.
print('Crawled and saved {} records of shareholder information of select_stocks to {}'.format(row_count, excel_file))

print("List of shareholers are \n", sharehold_records_df['org_name'])
#++insert your code here++ to answer Q3-1, Q3-2 and Q3-3
复制代码 |
|