|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
py的代码:
# -*- coding:utf-8 -*-
import urllib2,re,MySQLdb
urla = 'http://www.quanshu.net/book/9/9055/' #这个是盗墓笔记
def getlist():
html = urllib2.urlopen(urla).read()
html = html.decode('gb2312').encode('utf-8')
reg = re.compile(r'<li><a href="(.*?)" title="(.*?)">(.*?)</a>')
urls = re.findall(reg,html)
return urls
def getcontent(url):
html = urllib2.urlopen(url).read()
html = html.decode('gbk').encode('utf-8')
reg = re.compile(r'style5\(\);</script>(.*?)<script type="text/javascript">')
content = re.findall(reg,html)
return content
class Sql(object):
conn = MySQLdb.connect(
host = 'localhost',
port = 3306,
user = 'root',
passwd = 'passwd',
db = "xiaoshuo",
charset = "utf8",
)
def adddata(self,title,content):
cur = self.conn.cursor()
cur.execute("insert into books values(NULL,'%s','%s')" %(title,content))
cur.close()
self.conn.commit()
mysql = Sql()
for i in getlist():
print '正在爬去%s' %i[1]
title = i[1]
content = getcontent(urla + i[0])
print '正在插入数据库 %s' %i[1]
mysql.adddata(title,content)
break
报错:
Python 2.7.12 (v2.7.12:d33e0cf91556, Jun 27 2016, 15:24:40) [MSC v.1500 64 bit (AMD64)] on win32
Type "copyright", "credits" or "license()" for more information.
>>>
==== RESTART: C:\Users\Administrator\Desktop\shujuwajue\全书网小说爬取.py ====
正在爬去国庆贺文,非盗墓笔记,免费奉送。,共6035字
正在插入数据库 国庆贺文,非盗墓笔记,免费奉送。,共6035字
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\shujuwajue\全书网小说爬取.py", line 45, in <module>
mysql.adddata(title,content)
File "C:\Users\Administrator\Desktop\shujuwajue\全书网小说爬取.py", line 34, in adddata
cur.execute("insert into books values(NULL,'%s','%s')" %(title,content))
File "build\bdist.win-amd64\egg\MySQLdb\cursors.py", line 205, in execute
self.errorhandler(self, exc, value)
File "build\bdist.win-amd64\egg\MySQLdb\connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '; \\xe6\\x88\\x91\\xe6\\x8e\\xa5\\xe4\\xb8\\x8b\\xe6\\x9d\\xa5\\xe8\\xa6\\x81\\' at line 1")
>>>
import urllib2,re,MySQLdb 之后 加上这个试试
reload(sys)
sys.setdefaultencoding('utf8')
|
|