shiey 发表于 2022-4-13 23:33:30

求助!!!!!! 为啥mysql 里没有数据????不知道哪错了~~~

# 导入支持库
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import re
import pymysql
import time

# 设置Request对象
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
}

# 数据库连接
connect = pymysql.Connect(
    host='localhost',
    port=3306,
    user='root',
    password='',
    db='test_spider',
    charset='utf8'
)
# 获取游标
cursor = connect.cursor()
# 执行插入语句
sql_in = '''insert into doubanmovie (name,director,actor,style,country,release_time,time,score) value (%s,%s,%s,%s,%s,%s,%s,%s)'''


def get_movie_url(url):# 获取每个电影的链接
    html = requests.get(url=url, headers=headers)
    selector = etree.HTML(html.text)
    movie_hrefs = selector.xpath('//div[@class="hd"]/a/@href')
    for movie_href in movie_hrefs:
      get_movie_info(movie_href)


def get_movie_info(url):
    html = requests.get(url=url, headers=headers)# <Response >
    selector = etree.HTML(html.text)# <Element html at 0x20892e10108>
    try:
      name = selector.xpath('//*[@id="content"]/h1/span/text()')
    except IndexError:
      name = ''
    try:
      director = selector.xpath('//*[@id="info"]/span/span/a/text()')
    except IndexError:
      director = ''
    try:
      actors = selector.xpath('//*[@class="actor"]/span')
      actor = actors.xpath('string(.)')
      # actors = re.findall('<a href="/.*?/" rel="v:starring">(.*?)</a>',html.text,re.S)
    except IndexError:
      actor = ''
    try:
      style = re.findall('<span property="v:genre">(.*?)</span>', html.text, re.S)
    except IndexError:
      style = ''
    try:
      country = re.findall('<span class="pl">制片国家/地区:</span> (.*?)<br/>', html.text, re.S)
    except IndexError:
      country = ''
    try:
      release_time = re.findall('<span property="v:initialReleaseDate" content=.*?>(.*?)</span>', html.text, re.S)
    except IndexError:
      release_time = ''
    try:
      time = re.findall('<span property="v:runtime" content=.*?>(.*?)</span>', html.text, re.S)
    except IndexError:
      time = ''
    try:
      score = selector.xpath('//*[@id="interest_sectl"]/div/div/strong/text()')
    except IndexError:
      score = ''
    cursor.execute(
      sql_in,
      
    )

    urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0, 250, 25)]

    for url in urls:
      get_movie_url(url)
      time.sleep(5)
      print('我好了!')
    connect.commit()

wp231957 发表于 2022-4-14 06:44:25

在写入数据库之前先print一下你的数据
盲猜你根本就没取到数据

rsj0315 发表于 2022-4-14 08:09:21

print大法看看拿没拿到啊,

shiey 发表于 2022-4-14 19:32:21

有老铁帮忙实现一下吗py小白{:10_266:}

shiey 发表于 2022-4-14 19:33:18

wp231957 发表于 2022-4-14 06:44
在写入数据库之前先print一下你的数据
盲猜你根本就没取到数据

py小白 {:10_266:} 难受

wp231957 发表于 2022-4-14 19:40:32

shiey 发表于 2022-4-14 19:33
py小白难受

不知道你的代码是打哪来的,要知道单单是数据库就够写一本书的了
你这爬虫也不知会否,数据库也不知会否
就弄出这么复杂的代码,狠是佩服你

isdkz 发表于 2022-4-14 19:43:21


# 导入支持库
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from lxml import etree                           # 需要从 lxml 模块 导入 etree
import re
import requests                           # 需要导入 requests 模块
import pymysql
import time

# 设置Request对象
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
}

# 数据库连接
connect = pymysql.Connect(
    host='localhost',
    port=3306,
    user='root',
    password='secureworks',
    db='test_spider',
    charset='utf8'
)
# 获取游标
cursor = connect.cursor()
# 执行插入语句
sql_in = '''insert into doubanmovie (name,director,actor,style,country,release_time,time,score) value (%s,%s,%s,%s,%s,%s,%s,%s)'''


def get_movie_url(url):# 获取每个电影的链接
    html = requests.get(url=url, headers=headers)
    selector = etree.HTML(html.text)
    movie_hrefs = selector.xpath('//div[@class="hd"]/a/@href')
    for movie_href in movie_hrefs:
      get_movie_info(movie_href)


def get_movie_info(url):
    html = requests.get(url=url, headers=headers)# <Response >
    selector = etree.HTML(html.text)# <Element html at 0x20892e10108>
    try:
      name = selector.xpath('//*[@id="content"]/h1/span/text()')
    except IndexError:
      name = ''
    try:
      director = selector.xpath('//*[@id="info"]/span/span/a/text()')
    except IndexError:
      director = ''
    try:
      actors = selector.xpath('//*[@class="actor"]/span')
      actor = actors.xpath('string(.)')
      # actors = re.findall('<a href="/.*?/" rel="v:starring">(.*?)</a>',html.text,re.S)
    except IndexError:
      actor = ''
    try:
      style = re.findall('<span property="v:genre">(.*?)</span>', html.text, re.S)
    except IndexError:
      style = ''
    try:
      country = re.findall('<span class="pl">制片国家/地区:</span> (.*?)<br/>', html.text, re.S)
    except IndexError:
      country = ''
    try:
      release_time = re.findall('<span property="v:initialReleaseDate" content=.*?>(.*?)</span>', html.text, re.S)
    except IndexError:
      release_time = ''
    try:
      time = re.findall('<span property="v:runtime" content=.*?>(.*?)</span>', html.text, re.S)
    except IndexError:
      time = ''
    try:
      score = selector.xpath('//*[@id="interest_sectl"]/div/div/strong/text()')
    except IndexError:
      score = ''
    cursor.execute(
      sql_in,
      
    )

'''这一部分代码缩进错误'''
urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0, 250, 25)]

for url in urls:
    get_movie_url(url)
    time.sleep(5)
    print('我好了!')
connect.commit()
'''这一部分代码缩进错误'''
页: [1]
查看完整版本: 求助!!!!!! 为啥mysql 里没有数据????不知道哪错了~~~