Fetching a domain's full keyword list and rankings from Aizhan with Python

Aizhan's weight checker shows the keyword list for a domain, but exporting it is a hassle, and copying it by hand means cleaning up the formatting afterwards.
So I put together a Python script that exports the rows with a delimiter; once the file is in Excel, just split the columns on that delimiter.
The delimiter is customizable; currently it is three consecutive underscores.
2020-09-21: added saving the keyword index as well, now written to a CSV file.
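
For illustration, this is what one exported row looks like when joined with the triple-underscore delimiter (the field values here are made up):

# illustration only -- these field values are hypothetical
fields = ['some keyword', '1', '1', 'https://example.com/page.html', 'page title', '100']
print("___".join(fields))
# -> some keyword___1___1___https://example.com/page.html___page title___100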

#!/usr/bin/python
#coding:utf-8
import requests
import re
import sys
import csv
reload(sys)
sys.setdefaultencoding('utf-8')

"""
python2.7.x版本
"""


# https://www.isres.com  ops blog


def data2csv(data):
    try:
        path = '/tmp/aizhan.csv'
        csvfile = open(path, 'wb')
        # write a UTF-8 BOM so Excel detects the encoding correctly
        csvfile.write(u'\ufeff'.encode('utf8'))
        writer = csv.writer(csvfile)
        writer.writerow(['keyword', 'page', 'index', 'url', 'title', 'zhishu'])
        writer.writerows(data)
        csvfile.close()
        print "Data saved successfully"
    except Exception as ex:
        print "Failed to save data:", ex

C = {"SITE":{},"DBCONFIG":{},"FIELD":{},"SAVEAPI":'',"Thread":1,"HEADER":{}}
C['SITE']['home'] = 'https://baidurank.aizhan.com'

if C['SITE']['home'].startswith('https://'):
    C['ISHTTPS'] = True
    C['HOST'] = C['SITE']['home'].replace("https://", "")
else:
    C['ISHTTPS'] = False
    C['HOST'] = C['SITE']['home'].replace("http://", "")

C['HEADER'] = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Referer": C['HOST'],
    "Host": C['HOST'],
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.6,ar;q=0.4,en;q=0.2",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
}

# target: mobile Baidu rankings for susanguitar.com; the trailing /1/ appears to be the results page number
Target = 'https://baidurank.aizhan.com/mobile/susanguitar.com/-1/0/0/position/1/'


html = requests.get(Target, headers=C['HEADER'])
if html.status_code == 200:
    Cut_start = '<div class="baidurank-list">'
    Cut_end = '<div class="baidurank-pager">'
    Rows_Pattr = '<tr>(((?!tr)[\S\s])+?)</tr>'
    # cut out the region between the unique start/end markers
    Content = html.content
    first_result = Content.split(Cut_start)[1]
    second_result = first_result.split(Cut_end)[0]
    # extract every table row from that region with the regex
    result = re.findall(Rows_Pattr, second_result)
    if result:
        data_list = []
        # one regex per field: keyword, page, index, url+title (two groups), zhishu
        Rs_pattr = [
            '<a class="gray" rel="nofollow" target="_blank" href="(?:.+)" title="(.+?)">',
            '第(\d+?)页',
            '第(\d+?)页',
            '<a name="baiduLink" rel="nofollow" target="_blank" href="(.+?)" class="gray" title="(.+?)">',
            '<a class="gray" rel="nofollow" target="_blank" href="https://ci.aizhan.com/(?:.+)/">([\s\S]*?)</a>',
        ]
        for row in result:
            # findall returns (outer, inner) group tuples; row[0] is the row's inner HTML
            data_rows = []
            for Pattr_index, Rs_pattr_c in enumerate(Rs_pattr):
                check = re.search(Rs_pattr_c, row[0])
                if check != None:
                    data_rows.append(str(check.group(1)))
                else:
                    data_rows.append("/")

                # the url pattern (index 3) captures a second group: the page title
                if Pattr_index == 3:
                    if check != None:
                        data_rows.append(str(check.group(2)))
                    else:
                        data_rows.append("/")
            data_list.append(data_rows)

        data2csv(data_list)
    else:
        print "No rows matched; check the page data"

else:
    print "Request failed:", html.status_code

[root@localhost]# python aizhan.py 
Data saved successfully

The data is saved to /tmp/aizhan.csv
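
The Target URL above only pulls the first results page. Below is a minimal sketch of walking several pages, assuming the trailing number in the URL is the page index (that reading is a guess from the URL shape, not something Aizhan documents):

# sketch: fetch pages 1..5; assumes the trailing number is the page index
import time
import requests

base = 'https://baidurank.aizhan.com/mobile/susanguitar.com/-1/0/0/position/%d/'
for page in range(1, 6):
    resp = requests.get(base % page, headers=C['HEADER'])
    if resp.status_code != 200:
        break
    # feed resp.content through the same cut-and-regex parsing as above
    time.sleep(2)  # be polite between requests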


Tags: crawler
