A crawler built on Python 2.7, with no framework and nothing but regular expressions.
Strictly speaking it hardly counts as a crawler; it is really just a simple collector.
The same rule set can be used to collect several categories at once, and the collected data is saved to MySQL.
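The script inserts articles into a news table that is assumed to already exist; only the columns used in insert_db below (title, content, category_id, publish_date) are known from the code, so the following is just a minimal sketch of what that table could look like, created in the same style as the spider_log table in check_log (the column types and lengths are assumptions):

import MySQLdb

# Hypothetical one-off setup: create the assumed news table.
# Column names come from insert_db below; types and lengths are guesses.
def create_news_table(host, user, password, database):
    db = MySQLdb.connect(host, user, password, database)
    cur = db.cursor()
    cur.execute("create table if not exists news("
                "id int primary key auto_increment,"
                "title varchar(220) not null,"
                "content mediumtext not null,"
                "category_id int not null,"
                "publish_date datetime not null"
                ") default charset=utf8")
    db.commit()
    db.close()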
#!/usr/bin/python
#coding:utf-8
import requests
import re
import os
import urllib
import time
import random
import MySQLdb
# Python 2.7
# CAOHQHQHQ
# Database connection configuration
def db_config():
    # Fill in your own server address and credentials here
    return ['server ip', 'root', 'password', 'database name']

# Check whether a URL has already been collected
def check_log(url):
    conf = db_config()
    db = MySQLdb.connect(conf[0], conf[1], conf[2], conf[3])
    create_sql = "create table if not exists spider_log(id int primary key auto_increment, url varchar(220) not null) default charset=utf8"
    cur = db.cursor()
    cur.execute(create_sql)
    # Check whether this URL already exists
    cur.execute("select id from spider_log where url = %s", (url,))
    result = cur.fetchone()
    db.close()
    if result is not None:
        return 1
    else:
        return 0
# Add a URL to the collection history
def insert_log(url):
    conf = db_config()
    db = MySQLdb.connect(conf[0], conf[1], conf[2], conf[3])
    cur = db.cursor()
    cur.execute("insert into spider_log(url) values(%s)", (url,))
    db.commit()
    db.close()

# Publish the collected data
def insert_db(title, content):
    conf = db_config()
    db = MySQLdb.connect(conf[0], conf[1], conf[2], conf[3])
    cur = db.cursor()
    cur.execute("insert into news(title,content,category_id,publish_date) values(%s,%s,44,%s)",
                (title, content, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    db.commit()
    db.close()
# Site domain configuration, e.g. http://www.abc.com, without a trailing slash
def site_config():
    return 'http://www.abc.com'
# Download an image and return the local file path (or the original URL on failure)
def down_pic(url, file_path='/data/img_file'):
    time_str = time.strftime("%Y-%m-%d", time.localtime())
    time_str2 = time.strftime("%Y%m%d%H%M%S", time.localtime())
    try:
        file_path = file_path + '/' + time_str
        if not os.path.exists(file_path):
            os.makedirs(file_path)  # os.makedirs (not os.mkdir) behaves like mkdir -p
        # Image file extension
        file_suffix = os.path.splitext(url)[1]
        # New file name
        file_name = time_str2 + str(random.randint(0, 999999)) + file_suffix
        file_name = file_path + '/' + file_name
        urllib.urlretrieve(url, file_name)
        return file_name
    except Exception as e:
        print(e)
        return url
# Complete a relative URL with the site domain
def check_http(url):
    www_rule = 'http://([^/]+)/'
    check = re.search(www_rule, url)
    if check is None:
        return site_config() + url
    else:
        return url
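# Example behaviour (with site_config() returning 'http://www.abc.com'; inputs are illustrative):
#   check_http('/yule/12345.html')            -> 'http://www.abc.com/yule/12345.html'
#   check_http('http://img.abc.com/a/b.jpg')  -> 'http://img.abc.com/a/b.jpg'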
# Find images in the content and complete their URLs or download them locally
def check_img(content):
    img_rule = '<img(.+)src="([^ ]+)"'
    result = re.findall(img_rule, content)
    if result:
        for src in result:
            content = content.replace(src[1], down_pic(check_http(src[1])))
    return content
# Extract the list of URLs to collect from a category page
def get_item(content):
    split_start = '<ul class="news-list">'  # Start marker for slicing
    split_end = '</ul>'  # End marker for slicing
    regex_rule = '<h4><a href="(.+)" class="title" target="_blank">(.+)</a></h4>'  # Link regex
    # First slice
    first_result = content.split(split_start)[1]
    # Second slice
    second_result = first_result.split(split_end)[0]
    result = re.findall(regex_rule, second_result)
    url = []  # Result list
    if result:
        for item in result:
            if item is not None:
                url.append(item[0])  # Append the URL to the list
        return url
    else:
        return 0
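# The category pages are expected to contain markup roughly like this
# (reconstructed from the slice markers and regex above; the href value is illustrative):
#   <ul class="news-list">
#     <h4><a href="http://www.abc.com/yule/12345.html" class="title" target="_blank">Article title</a></h4>
#     ...
#   </ul>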
# Fetch a content page
# Argument: the URL of the article page to collect
# Returns the extracted data as [title, content] (the tag rule is left empty here)
# Returns 0 if the page does not match the rules
def get_content(url):
    title_rule = '<h1>(.+)</h1>'  # Regex for the title
    content_rule = '<center></center>([\S\s]*) <div class="ad640">'  # Regex for the body
    tag_rule = ''  # Regex for tags (unused)
    filter_rule = ['<div>(.*)</div>', '<div>(.*)</div>']  # HTML tag filter rules (unused)
    result = []  # Result list
    html = requests.get(url)
    title = re.search(title_rule, html.content)
    if title is not None:
        result.append(title.group(1))
    else:
        return 0
    content = re.search(content_rule, html.content)
    if content is not None:
        result.append(strip_tags(content.group(1), ['p', 'img'], ['下一页', ' style="text-align: center;"', 'alt="undefined" ', ' ', '”', ' ', '“', "'"]))
    else:
        return 0
    return result
# Strip HTML tags from content while keeping selected tags
# Arguments: content, tags to keep e.g. ['p', 'img'], strings to remove e.g. ['foo', 'bar']
# Returns the filtered content
def strip_tags(html, tags=None, strs=None):
    # Protect the tags that should be kept
    for tag in tags:
        html = html.replace('<' + tag, '{:tag' + tag)
        html = html.replace('</' + tag + '>', '{:tag/' + tag + '}')
    # Strip everything else
    dr = re.compile(r'<([^>]+)>', re.S)
    html = dr.sub('', html)
    # Restore the protected tags
    for tag in tags:
        html = html.replace('{:tag' + tag, '<' + tag)
        html = html.replace('{:tag/' + tag + '}', '</' + tag + '>')
    # Remove the unwanted strings
    for s in strs:
        html = html.replace(s, '')
    return html
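# For example, keeping <p> tags while stripping everything else:
#   strip_tags('<div><p>hi</p><br></div>', ['p'], [])  returns  '<p>hi</p>'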
# Entry URL for a single category
# url = "http://www.abc.com/shehuixinwen/"
# URLs for batch collection; they must all follow the same collection rules
urls = ['http://www.abc.com/yule/wanghong/', 'http://www.abc.com/yule/yinyue/', 'http://www.abc.com/yule/zongyi/', 'http://www.abc.com/yule/dianying/', 'http://www.abc.com/yule/dianshiju/', 'http://www.abc.com/yule/mingxingkandian/']
for url in urls:
    try:
        html = requests.get(url)
        url_list = get_item(html.content)
        for work_url in url_list:
            if check_log(work_url) == 0:
                news = get_content(work_url)
                if news != 0:
                    news[1] = check_img(news[1])
                    print("Title")
                    print("-" * 80)
                    print(news[0])
                    print("-" * 80)
                    print("Body")
                    print(news[1])
                    insert_db(news[0], news[1])  # Publish the data
                    insert_log(work_url)  # Add to the collection history
            else:
                print(work_url + ' has already been collected')
    except Exception as e:
        print(e)