鱼C论坛

 找回密码
 立即注册
查看: 2014|回复: 4

[已解决]还是58的,不过这回变成字符串合并的,有什么简单的办法

[复制链接]
发表于 2017-3-29 22:46:41 | 显示全部楼层 |阅读模式
20鱼币
  1. 这个是修改不成功的

  2. import urllib.request
  3. import os
  4. import re

  def url_open(url):
      """Download ``url`` and return the raw response body.

      The request is sent with a desktop-browser User-Agent header.

      Args:
          url: Address of the page to fetch.

      Returns:
          The undecoded response body as ``bytes``; the caller decodes it.

      Raises:
          urllib.error.URLError: Propagated from ``urlopen`` when the
              request fails.
      """
      req = urllib.request.Request(url)
      req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2372.400 QQBrowser/9.5.10548.400')
      # Open the Request object rather than the bare URL: opening ``url``
      # directly would discard the header that was just attached.
      with urllib.request.urlopen(req) as response:
          # The ``with`` block closes the connection once the body is read.
          return response.read()


  # Earlier regex drafts from the original script, kept for reference only.
  # They used to sit in a non-raw ''' string inside the loop, where "\d" and
  # "\s" are invalid string escape sequences.
  #   zongjia=re.compile(r'<div .*?qj-listright btall">.*?class="pri">(.*?)</b>(.*?)&nbsp;&nbsp;.*?(/d/d/d/d.*?)<br>.*?class="showroom">(.*?)</span>(/d{1,2}/d.*?)<br>.*?</div>')
  #   print(zongjia)
  #   <div .*?"qj-listright btall">.*?class="pri">(.*?)</b> (.*?)\s*?&nbsp;&nbsp;.*?(\d\d\d\d.*?)\s.*?class="showroom">.*?(\S*?).*?</span>.*?(\d{1,4}\S*?).*?</div>
  #   <div class="qj-listright btall".*?class="pri">(\d{1,3}).*?&nbsp;&nbsp;\D*(\d{3,4}).*?"showroom">\D*(\S*?)\D*?(\d{2,4}).*?</div>

  # Patterns are compiled once, outside the page loop.
  # zongjia (total price): captures the 2-3 digit number after class='pri
  # that is followed, across non-digits, by a 4-digit number.
  zongjia = re.compile(r"class='pri.*?(\d{2,3}.\d|\d{2,3})\D*?\d\d\d\d")
  # danjia (unit price): captures a 4-digit number after class='pri.
  # NOTE(review): ".*?" does not cross newlines (no re.DOTALL), which may be
  # why this pattern finds nothing on the live page -- confirm against the HTML.
  danjia = re.compile(r"class='pri.*?(?=\d{2,3}.\d|\d{2,3})\D*?(\d\d\d\d)")
  # mianji (area): defined by the original script but never used.
  mianji = r"class='pri.*?(\d{2,3}.\d|\d{2,3})\D*?\d\d\d\d\D*?\d.*?\d\D*?(\d{2,4}.\d|\d{2,3})"

  # Walk result pages pn1 .. pn4.
  for a in range(1, 5):
      url = "http://jdz.58.com/ershoufang/pn" + str(a) + "/"
      print(url)

      # Fetch each page exactly once, through url_open.
      html = url_open(url).decode("utf-8")

      zongjia_list = zongjia.findall(html)
      danjia_list = danjia.findall(html)
      # The two result lists are simply concatenated, not paired per listing.
      print(zongjia_list + danjia_list)
      print()
复制代码

输出
C:\Users\admin\AppData\Local\Programs\Python\Python36\python.exe D:/练习存放/爬取58同城/爬取信息.py
http://jdz.58.com/ershoufang/pn1/
['76', '69.8', '60', '14.5', '65', '62', '47', '57', '54', '49.8', '48', '50', '35', '38', '60', '27', '30', '57', '85', '160', '56', '64.8', '53', '100', '50', '58', '280', '65', '30', '49.8', '38', '100', '58', '30', '36', '49.8', '38', '35', '58', '38', '188', '36', '95', '27', '38.5', '37.5', '35.8', '48', '50', '10', '61', '78', '13.5', '46', '230', '57', '36', '43', '26', '40', '22', '86', '60', '85', '42', '63', '70', '46', '52.8', '46', '53', '53', '58', '41', '27', '59.6', '17', '53', '21.1', '83', '42', '118', '50', '50', '88', '57', '39.8', '58', '26', '65', '78']

http://jdz.58.com/ershoufang/pn2/
['52.5', '46', '46', '73', '45', '22.5', '56.6', '112', '53', '27.8', '105', '57.8', '40', '39', '41.8', '83', '62', '36', '23.8', '68', '40', '24', '73', '37', '30', '55', '60', '120', '26', '55', '48', '87', '32', '80', '44.6', '52', '50', '26', '57', '80', '25', '46', '60', '39.5', '30', '43.8', '18', '23', '39', '45']

http://jdz.58.com/ershoufang/pn3/
['85', '68', '23.2', '61', '52', '50', '28', '22', '32', '43.6', '41', '32', '36.8', '18', '65', '25', '110', '65', '36', '17', '110', '45.5', '270', '26.8', '56', '99.9', '20.6', '88', '45', '56', '60', '32', '52', '79', '66', '28', '54', '40', '43.8', '55', '53', '20', '59', '49', '240', '66', '29', '27', '46', '51']

http://jdz.58.com/ershoufang/pn4/
['79', '66', '28', '54', '40', '43.8', '55', '53', '20', '59', '49', '240', '51', '66', '29', '27', '46', '51', '46', '39.8', '49', '47.5', '45', '46', '52', '53', '76', '52', '43', '46', '46', '66', '55', '42.5', '25', '55', '62', '32', '80', '29.6', '45', '70', '57', '50', '118', '27', '38', '28', '43', '59.8']


Process finished with exit code 0


这个是修改前的

  1. import urllib.request
  2. import os
  3. import re

  def url_open(url):
      """Fetch ``url`` with a browser-like User-Agent and return its body.

      Args:
          url: Address of the page to fetch.

      Returns:
          The response body as undecoded ``bytes``.

      Raises:
          urllib.error.URLError: Propagated from ``urlopen`` when the
              request fails.
      """
      req = urllib.request.Request(url)
      req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2372.400 QQBrowser/9.5.10548.400')
      # The prepared Request must be the thing that is opened; passing the
      # plain URL string would send the request without the custom header.
      with urllib.request.urlopen(req) as response:
          # Leaving the ``with`` block closes the response.
          return response.read()


  # Earlier regex drafts from the original script, kept for reference only.
  # They used to sit in a non-raw ''' string inside the loop, where "\d" and
  # "\s" are invalid string escape sequences.
  #   zongjia=re.compile(r'<div .*?qj-listright btall">.*?class="pri">(.*?)</b>(.*?)&nbsp;&nbsp;.*?(/d/d/d/d.*?)<br>.*?class="showroom">(.*?)</span>(/d{1,2}/d.*?)<br>.*?</div>')
  #   print(zongjia)
  #   <div .*?"qj-listright btall">.*?class="pri">(.*?)</b> (.*?)\s*?&nbsp;&nbsp;.*?(\d\d\d\d.*?)\s.*?class="showroom">.*?(\S*?).*?</span>.*?(\d{1,4}\S*?).*?</div>
  #   <div class="qj-listright btall".*?class="pri">(\d{1,3}).*?&nbsp;&nbsp;\D*(\d{3,4}).*?"showroom">\D*(\S*?)\D*?(\d{2,4}).*?</div>

  # One combined pattern with five capture groups, compiled once outside the
  # page loop. Judging by the sample output in the post, the groups appear to
  # be (total price, unit price, rooms, halls, area) -- confirm against the
  # page HTML.
  zongjia = re.compile(r"class='pri.*?(\d{2,3}.\d|\d{2,3})\D*?(\d\d\d\d)\D*?(\d).*?(\d)\D*?(\d{2,4}.\d|\d{2,3})")

  # Walk result pages pn1 .. pn4.
  for a in range(1, 5):
      url = "http://jdz.58.com/ershoufang/pn" + str(a) + "/"
      print(url)

      # Fetch each page exactly once, through url_open.
      html = url_open(url).decode("utf-8")

      # findall returns one 5-tuple of strings per match.
      zongjia_list = zongjia.findall(html)
      print(zongjia_list)
      print()
复制代码

输出
C:\Users\admin\AppData\Local\Programs\Python\Python36\python.exe D:/练习存放/爬取58同城/爬取信息.py
http://jdz.58.com/ershoufang/pn1/
[('76', '5671', '4', '2', '134'), ('69.8', '4417', '5', '2', '158'), ('49.8', '3557', '3', '2', '140'), ('35', '2916', '3', '1', '120'), ('54', '5934', '3', '2', '91'), ('58', '4360', '3', '2', '133'), ('100', '3125', '6', '3', '320'), ('36', '3711', '2', '1', '97'), ('30', '3260', '2', '1', '92'), ('62', '5636', '3', '1', '110'), ('35', '4814', '2', '1', '72.6'), ('49.8', '5291', '3', '1', '94.1'), ('57', '4071', '3', '2', '140'), ('38', '4318', '2', '1', '88'), ('30', '2222', '3', '2', '135'), ('38', '4750', '2', '1', '80'), ('60', '4545', '3', '2', '132'), ('57', '2572', '5', '3', '221.5'), ('160', '3200', '9', '3', '500'), ('100', '6451', '4', '2', '155'), ('27', '2934', '3', '1', '92'), ('65', '5652', '3', '2', '115'), ('38', '2533', '4', '2', '150'), ('49.8', '3608', '3', '2', '138'), ('50', '3333', '4', '2', '150'), ('14.5', '2416', '1', '1', '60'), ('30', '3846', '2', '1', '78'), ('58', '3314', '3', '2', '175'), ('50', '3311', '4', '2', '151'), ('48', '4948', '2', '1', '97'), ('65', '4452', '4', '2', '146'), ('38', '2533', '2', '1', '150'), ('60', '4511', '3', '2', '133'), ('36', '4186', '2', '1', '86'), ('188', '3760', '6', '2', '500'), ('53', '2523', '5', '2', '210'), ('58', '4461', '3', '2', '130'), ('280', '4242', '8', '3', '660'), ('47', '5053', '2', '1', '93'), ('85', '3309', '5', '3', '256.8'), ('64.8', '4800', '4', '2', '135'), ('56', '6222', '2', '1', '90'), ('39', '4333', '3', '1', '90'), ('44', '4489', '2', '1', '98'), ('45', '4128', '3', '1', '109'), ('75', '2707', '4', '6', '277'), ('28', '2333', '3', '2', '120'), ('32', '3200', '3', '1', '100'), ('78', '2932', '6', '3', '266'), ('70', '2187', '6', '2', '320'), ('74', '2792', '4', '2', '265'), ('39', '4534', '2', '1', '86'), ('41', '4315', '3', '1', '95'), ('51', '3642', '3', '2', '140'), ('13', '2363', '2', '1', '55'), ('58', '3352', '4', '2', '173'), ('37', '3874', '3', '1', '95.5'), ('58', '4055', '3', '2', '143'), ('38', '2657', '3', '2', '143'), ('46', '3565', '3', '1', '129'), ('37.8', 
'3780', '2', '1', '100'), ('48.6', '3654', '3', '2', '133'), ('24.5', '2487', '3', '2', '98.5'), ('210', '7000', '5', '3', '300'), ('130', '2600', '7', '3', '500'), ('26', '4333', '2', '1', '60'), ('36', '3600', '3', '1', '100'), ('27.6', '3680', '2', '1', '75'), ('20.6', '3678', '2', '1', '56'), ('38', '3958', '2', '1', '96'), ('19', '2065', '2', '1', '92'), ('32', '3764', '3', '1', '85'), ('186', '8183', '6', '3', '227.3'), ('30', '6666', '1', '1', '45'), ('43', '2986', '3', '2', '144'), ('39', '3000', '5', '1', '130'), ('23.5', '2937', '2', '1', '80'), ('16', '2133', '2', '1', '75'), ('41.5', '4234', '2', '1', '98'), ('59', '6145', '2', '1', '96'), ('75', '7142', '3', '1', '105'), ('30', '3092', '2', '1', '97'), ('43.8', '4093', '2', '1', '107'), ('45', '4736', '2', '1', '95'), ('38', '3800', '3', '2', '100'), ('23.8', '2644', '2', '1', '90'), ('46.6', '3451', '3', '1', '135'), ('57', '4191', '3', '2', '136'), ('158', '6869', '5', '4', '230'), ('76', '4523', '3', '2', '168'), ('72', '3200', '4', '3', '225'), ('41', '3867', '3', '1', '106')]

http://jdz.58.com/ershoufang/pn2/
[('41.6', '2971', '4', '2', '140'), ('58', '6987', '3', '2', '83'), ('56.6', '5145', '3', '2', '110'), ('33', '3300', '2', '1', '100'), ('14.6', '3173', '2', '1', '46'), ('17', '2125', '3', '1', '80'), ('32', '3200', '2', '1', '100'), ('27', '3000', '3', '1', '90'), ('42', '4516', '3', '1', '93'), ('29', '3536', '2', '1', '82'), ('46', '4842', '2', '1', '95'), ('95', '3941', '5', '3', '241'), ('39', '4333', '3', '1', '90'), ('52.5', '5769', '3', '1', '91'), ('41.8', '4180', '2', '1', '100'), ('39', '3223', '2', '1', '121'), ('41', '4823', '2', '1', '85'), ('26', '3250', '3', '1', '80'), ('56', '4480', '3', '1', '125'), ('36.8', '3956', '2', '1', '93'), ('46', '3377', '4', '2', '136.2'), ('28', '3043', '2', '1', '92'), ('28', '4000', '2', '1', '70'), ('55', '6250', '2', '1', '88'), ('46', '4693', '3', '1', '98'), ('72', '5853', '4', '2', '123'), ('24', '3428', '3', '1', '70'), ('60', '7142', '3', '2', '84'), ('25', '3125', '3', '1', '80'), ('73', '3543', '5', '2', '206'), ('55', '4583', '3', '1', '120'), ('39.8', '4422', '2', '1', '90'), ('56', '4341', '3', '2', '129'), ('73.5', '5326', '3', '2', '138'), ('48.6', '3600', '3', '2', '135'), ('45', '2976', '4', '1', '151.2'), ('58', '4833', '3', '1', '120'), ('205', '4767', '8', '6', '430'), ('78', '6446', '3', '2', '121'), ('75', '2777', '6', '3', '270'), ('265', '9532', '8', '5', '278'), ('109', '2978', '6', '3', '366'), ('65', '5371', '3', '2', '121'), ('58', '4640', '3', '2', '125'), ('260', '5829', '6', '2', '446'), ('53', '5888', '2', '1', '90'), ('19', '4579', '1', '1', '41.4'), ('128', '8152', '2', '2', '157'), ('50', '3521', '3', '2', '142')]

http://jdz.58.com/ershoufang/pn3/
[('70', '4729', '4', '2', '148'), ('63', '5727', '3', '1', '110'), ('32', '2909', '3', '1', '110'), ('115', '6571', '4', '2', '175'), ('53', '3419', '3', '2', '155'), ('32', '3555', '3', '1', '90'), ('55', '5000', '3', '1', '110'), ('25.8', '3225', '3', '1', '80'), ('28.5', '3800', '2', '1', '75'), ('49', '4298', '2', '2', '114'), ('59.6', '4197', '3', '2', '142'), ('76.8', '5688', '4', '2', '135'), ('48', '3478', '3', '2', '138'), ('46', '5281', '2', '1', '87.1'), ('30.5', '5980', '1', '1', '51'), ('72', '3200', '5', '3', '225'), ('51', '5100', '3', '1', '100'), ('59.8', '4288', '4', '2', '139.4'), ('35', '3043', '3', '1', '115'), ('36.8', '4088', '2', '1', '90'), ('72.3', '4084', '5', '3', '177'), ('50', '3623', '3', '2', '138'), ('33', '3666', '2', '1', '90'), ('16.2', '2700', '1', '1', '60'), ('28', '5490', '1', '1', '51'), ('52', '6516', '2', '1', '79.8'), ('68', '4927', '3', '2', '138'), ('44', '4000', '3', '2', '110'), ('90', '6000', '3', '2', '150'), ('65', '5416', '3', '2', '120'), ('35.8', '3977', '2', '1', '90'), ('32', '2461', '3', '1', '130'), ('21', '2333', '3', '1', '90'), ('40.8', '3709', '3', '2', '110'), ('35', '3365', '2', '1', '104'), ('60', '4477', '3', '2', '134'), ('55.5', '3881', '3', '2', '143'), ('25.8', '3440', '3', '1', '75'), ('49.5', '4583', '2', '1', '108'), ('65', '4676', '4', '2', '139'), ('28', '3333', '2', '1', '84'), ('69', '6388', '3', '1', '108'), ('39.8', '3061', '5', '1', '130'), ('47.5', '3275', '3', '1', '145'), ('37', '4352', '2', '1', '85'), ('26.8', '3828', '2', '1', '70'), ('36', '6545', '1', '1', '55'), ('53', '3403', '3', '2', '155.7'), ('66', '5500', '3', '1', '120'), ('27', '3375', '3', '1', '80')]

http://jdz.58.com/ershoufang/pn4/
[('32', '2461', '3', '1', '130'), ('21', '2333', '3', '1', '90'), ('40.8', '3709', '3', '2', '110'), ('35', '3365', '2', '1', '104'), ('60', '4477', '3', '2', '134'), ('55.5', '3881', '3', '2', '143'), ('128', '2327', '6', '3', '550'), ('49.5', '4583', '2', '1', '108'), ('65', '4676', '4', '2', '139'), ('28', '3333', '2', '1', '84'), ('69', '6388', '3', '1', '108'), ('39.8', '3061', '5', '1', '130'), ('47.5', '3275', '3', '1', '145'), ('37', '4352', '2', '1', '85'), ('49', '3769', '3', '2', '130'), ('26.8', '3828', '2', '1', '70'), ('36', '6545', '1', '1', '55'), ('53', '3403', '3', '2', '155.7'), ('25', '3333', '2', '1', '75'), ('43.3', '5280', '2', '1', '82'), ('235', '7833', '5', '3', '300'), ('39', '4333', '3', '1', '90'), ('25.8', '3440', '3', '1', '75'), ('38.7', '4662', '2', '1', '83'), ('16.8', '2800', '2', '1', '60'), ('66', '5500', '3', '1', '120'), ('27', '3375', '3', '1', '80'), ('15', '1764', '2', '1', '85'), ('48', '5052', '2', '1', '95'), ('38', '3304', '3', '1', '115'), ('27', '2934', '3', '1', '92'), ('46', '5111', '2', '1', '90'), ('32.8', '3538', '2', '1', '92.7'), ('47', '3333', '3', '2', '141'), ('62', '5794', '3', '1', '107'), ('90', '5287', '5', '2', '170.2'), ('25.8', '3440', '3', '1', '75'), ('35', '2333', '4', '3', '150'), ('27', '3000', '3', '1', '90'), ('48', '4119', '3', '1', '116.5'), ('56', '2466', '5', '2', '227'), ('31', '3780', '3', '1', '82'), ('30', '3333', '2', '1', '90'), ('97.6', '3253', '4', '2', '300'), ('30', '3614', '2', '1', '83'), ('190', '4222', '6', '3', '450'), ('37', '3814', '2', '1', '97'), ('50', '3623', '3', '2', '138'), ('49.8', '5533', '2', '1', '90'), ('62', '4366', '3', '2', '142')]


Process finished with exit code 0

我想要的是把每套房子的各项信息合并成一个字符串再输出,例如把 ('32', '2461', '3', '1', '130') 输出成"32万 2461元/m² 3室1厅 130m²",然后存储起来。

最佳答案
2017-3-29 22:46:42
jia3168 发表于 2017-3-30 22:16
最好给我个示范,我材料专业的。感觉学的很艰难,想找个人指导我都不好弄。

http://bbs.fishc.com/thread-84993-1-1.html
这是昨天写的爬鱼C的信息并保存表格的完整爬虫,整个思路都在里面,你自己看吧,一般爬虫大体上差不多都是这个套路,只是信息的解析方式不同而已

最佳答案

查看完整内容

http://bbs.fishc.com/thread-84993-1-1.html 这是昨天写的爬鱼C的信息并保存表格的完整爬虫,整个思路都在里面,你自己看吧,一般爬虫大体上差不多都是这个套路,只是信息的解析方式不同而已
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

发表于 2017-3-29 22:46:42 | 显示全部楼层    本楼为最佳答案   
jia3168 发表于 2017-3-30 22:16
最好给我个示范,我材料专业的。感觉学的很艰难,想找个人指导我都不好弄。

http://bbs.fishc.com/thread-84993-1-1.html
这是昨天写的爬鱼C的信息并保存表格的完整爬虫,整个思路都在里面,你自己看吧,一般爬虫大体上差不多都是这个套路,只是信息的解析方式不同而已
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

发表于 2017-3-30 09:24:08 | 显示全部楼层
你把每个房子的信息作为一个item = {}字典形式
然后比如价格就是item['price'] = 你用正则提取到的价格信息
同理,其他的信息也是这样表达
最后你就得到了一个键值对的字典形式了
储存字典的形式不用我教吧
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

发表于 2017-3-30 09:27:31 | 显示全部楼层
http://bbs.fishc.com/thread-84760-1-1.html
这个里面从23行开始往下就是提取信息到储存信息的过程,你看看吧
我先把所有信息放在字典里面,然后把需要的信息变成一个列表,再把列表用逗号连接成一个字符串(为了满足CSV的分列格式),之后保存到表格中
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

 楼主| 发表于 2017-3-30 22:16:31 | 显示全部楼层
gopythoner 发表于 2017-3-30 09:27
http://bbs.fishc.com/thread-84760-1-1.html
这个里面从23行开始往下就是提取信息到储存信息的过程,你看 ...

最好给我个示范,我材料专业的。感觉学的很艰难,想找个人指导我都不好弄。
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-5-17 11:01

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表