Posted by luroot153 on 2018-4-7 11:53:45

Python crawler for non-VIP novel content on the Qidian novel site

This post was last edited by luroot153 on 2018-4-7 11:59.

The script below takes the catalogue page of a free (non-VIP) novel on Qidian, follows every chapter link on it, and appends each chapter title and its text to a reader.txt file in a folder you choose.

import time

import requests
from bs4 import BeautifulSoup

def get_url(url, path):
    # A browser-like User-Agent is enough for free (non-VIP) chapters.
    header = {
        "Accept-Language": "zh-CN,zh;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    html = requests.get(url, headers=header)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'lxml')
    # print(soup.prettify())  # prettify() formats the markup and closes any unclosed tags.
    # print('*' * 40)
    # print(soup.title.string)
    k = 1
    for i in soup.select('ul.cf > li > a'):  # chapter links on the catalogue page
        print("Downloading chapter " + str(k) + ", title: " + i.string)
        u = 'https:' + i['href']  # the hrefs are protocol-relative
        htmls = requests.get(u, headers=header)
        htmls.encoding = 'utf-8'
        soups = BeautifulSoup(htmls.text, 'lxml')
        # The pages are UTF-8, but the file is written as GB18030 (a superset
        # of GBK) so it also opens cleanly in tools that default to GBK.
        with open(path + '\\reader.txt', 'a+', encoding='GB18030') as f:
            f.write(i.string)
            f.write('\n\n\n\n')
            for j in soups.select('div > div.read-content.j_readContent > p'):
                f.write(j.get_text())
                f.write('\n')
        print("Chapter " + str(k) + " downloaded!!!")
        print()
        k = k + 1
        time.sleep(1)  # pause briefly between chapters to go easy on the server
    print("Novel download finished, thanks for using!")

url = input('Please enter the URL of the Qidian novel you want to crawl: ')
path = input('Please enter the folder to save the novel in (e.g. E:\\b\\a): ')
print('Downloading may take a while, please be patient....')
get_url(url, path)
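
A note on the GB18030 choice in the write calls: GB18030 is a superset of GBK that covers all of Unicode, so a rare character in a chapter won't raise UnicodeEncodeError the way plain GBK can. For example (an illustrative snippet, not from the original script):

s = '𠀀'  # a CJK Extension B character that GBK cannot represent
s.encode('gb18030')  # fine: GB18030 has a four-byte form for it
s.encode('gbk')      # raises UnicodeEncodeError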
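One fragility worth noting: the requests.get calls have no timeout, so any network hiccup aborts the whole download. A minimal sketch of a retrying fetch helper (the fetch_page name and the retry counts are my own, not part of the original script):

import time
import requests

def fetch_page(url, header, retries=3, delay=2):
    """Fetch a page as UTF-8 text, retrying a few times before giving up."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=header, timeout=10)
            resp.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
            resp.encoding = 'utf-8'
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries; let the caller see the error
            time.sleep(delay)

Both fetches in get_url could then go through fetch_page(u, header), with the BeautifulSoup parsing left unchanged.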