|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 wongyusing 于 2018-4-12 13:49 编辑
代码如下:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # @Time : 18-4-11 下午1:43
- # @Author : Aries
- # @Site :
- # @File : taobao.py
- # @Software: PyCharm
- import re
- from selenium import webdriver #浏览器驱动
- from selenium.webdriver.common.by import By #定位模块
- from selenium.webdriver.support.ui import WebDriverWait #页面元素
- from selenium.webdriver.support import expected_conditions as EC#显性等待页面加载
- from pyquery import PyQuery as pq #Python爬虫中的一种网页解析库
- import pymysql
- conn = pymysql.connect(host='localhost', db='test', user='root', passwd='wongyusing',charset='utf8')
- cur = conn.cursor()
- url = 'https://www.taobao.com/'
- browser = webdriver.Chrome() #浏览器
- wait = WebDriverWait(browser,10)#等待页面加载10s
- def search():
- try:
- browser.get(url)
- # 定位输入栏
- inpot = wait.until(
- EC.presence_of_element_located((By.CSS_SELECTOR,'#q'))
- )
- #定位”搜索“按钮
- submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#J_TSearchForm > div.search-button > button')))
- inpot.send_keys('咖啡')#输入内容,以后可以创建一个列表,从而实现批量爬取
- submit.click()#点击”搜索“按钮
- #获取总页数
- total = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.total')))[0]
- get_content()#获取内容
- return total.text #共100页
- except TimeoutError:
- return search()
- def next_page(page_num):#翻页操作,根据输入的页码在输入栏中翻页
- try:
- #获取页码输入栏
- inpot = wait.until(
- EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
- )
- #定位并点击“确定”按钮
- submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
- inpot.clear()#清除之前输入的页码
- inpot.send_keys(page_num)#输入页码
- submit.click()#点击“确定"按钮
- wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(page_num)))
- get_content()#获取内容
- except TimeoutError:
- next_page(page_num)
- def get_content():#获取信息/内容
- #定位商品列表
- wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,'#mainsrp-itemlist .items .item')))
- html = browser.page_source #注意这里的mainsrp-itemlist(晚上)
- doc = pq(html) #有时会变成m-itemlist(下午)
- items = doc('#mainsrp-itemlist .items .item').items()#获取内容列表
- for item in items:
- content = {
- 'title': item.find('.title').text(), # 标题
- 'shop': item.find('.shop').text(), # 店铺名称
- 'price': item.find('.price').text(), # 价格
- 'url': item.find('.title .a').attr('href'), # 連接
- 'img': item.find('.pic .img').attr('src'), # 图片
- 'deal': item.find('.deal-cnt').text()[:-3], # 交易数量
- 'location': item.find('.location').text(), # 发货地点
- }
- print(content)#打印内容(测试),成功,内容如下一行
- #{'deal': '6429', 'title': '【旗舰店】雀巢\n咖啡\n法国进口冻干法式金牌黑\n咖啡\n100g+50g*2装', 'price': '¥\n138.00', 'shop': '雀巢官方旗舰店', 'location': '上海', 'img': '//g-search3.alicdn.com/img/bao/uploaded/i4/i2/745949152/TB1NoI1kQSWBuNjSszdXXbeSpXa_!!0-item_pic.jpg_180x180.jpg'}
- #cur.execute(content)#存入数据库MYSQL
- #conn.commit()
- def gogo():#主函数
- totla = search()#打开淘宝首页,输入”需要搜索的内容“。获取内容,返回最大页数
- totla = int(re.compile('(\d+)').search(totla).group(1))#得到总页数并迭代出来
- for i in range(2,totla+1):#因为第一页已经获取内容了,从第二页开始获取
- next_page(i)#获取内容后,在网页的底部的跳转页码栏中输入页码翻页
- #conn.close()
- if __name__ == '__main__':
- gogo()
复制代码
问题如下:
1.现在数据已经采集完毕,没有问题,但我入库总是报错。该如何修改?
- Traceback (most recent call last):
- File "/home/sing/桌面/TaoBao/taobao.py", line 92, in <module>
- gogo()
- File "/home/sing/桌面/TaoBao/taobao.py", line 85, in gogo
- totla = search()#打开淘宝首页,输入”需要搜索的内容“。获取内容,返回最大页数
- File "/home/sing/桌面/TaoBao/taobao.py", line 41, in search
- get_content()#获取内容
- File "/home/sing/桌面/TaoBao/taobao.py", line 81, in get_content
- cur.execute(content)#存入数据库MYSQL
- File "/usr/local/lib/python3.5/dist-packages/pymysql/cursors.py", line 165, in execute
- result = self._query(query)
- File "/usr/local/lib/python3.5/dist-packages/pymysql/cursors.py", line 321, in _query
- conn.query(q)
- File "/usr/local/lib/python3.5/dist-packages/pymysql/connections.py", line 859, in query
- self._execute_command(COMMAND.COM_QUERY, sql)
- File "/usr/local/lib/python3.5/dist-packages/pymysql/connections.py", line 1095, in _execute_command
- packet = prelude + sql[:packet_size-1]
- TypeError: unhashable type: 'slice'
复制代码
2.有没有一步一个脚印的MYSQL的教程或者博文??让我借鉴一下
|
|