This article shows how to write a Python crawler that queries Baidu for the pages it has indexed for a given site, extracts the keywords from each indexed page, and then looks up where the site ranks in Baidu's search results for each of those keywords. The result is a practical, fully automated Python script for tracking the keyword rankings of a site's pages.
The code and usage follow below:
Usage: seo.py www.szsudu.com <split-save> <start-keyword> <link-file>
seo.py is the script's file name.
www.szsudu.com is the target site to check; do not prefix it with http://.
<split-save> takes the value 1 or 0: 1 saves ranked and unranked keywords to two separate files, 0 keeps them in a single file.
<start-keyword> is the keyword to resume from, typically used to continue after the program was interrupted.
<link-file> is a file name such as url.txt; when given, the program reads the URLs saved in that file instead of querying Baidu for the site's indexed pages.
# seo.py www.szsudu.com 1 "專(zhuān)利" szsudu.txt  -- reads the URLs in szsudu.txt, extracts each page's keywords, looks up www.szsudu.com's ranking in Baidu's results for each keyword, and stores ranked and unranked keywords in two separate files
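# seo.py www.szsudu.com 0 "專(zhuān)利"  -- a hypothetical resume run: keep ranked and unranked results in one file and skip ahead to the keyword "專(zhuān)利" after an interruption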
#!/usr/bin/python
import os
import requests
import re
import time
import sys
import colorama
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
#from selenium.webdriver.support.wait import WebDriverWait
colorama.init(autoreset=True)
todayStr = time.strftime("%Y%m%d",time.localtime())
splitSave = 0
startPos = ""
urlFile = ""
onlyFromUrlFile = False #whether to check only the pages in urlFile that Baidu has already indexed
saveUnclude = False #whether to save the result-page HTML for keywords with no ranking
# Prepare headless-mode options (swap the two browser lines below to run Chrome without a window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
#browser = webdriver.Chrome(options=options)
browser = webdriver.Chrome()
#wait= WebDriverWait(browser,10)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}
def toInt(num):
    if num != '':
        return int(num)
    else:
        return 0
# Append a timestamped line to today's log file
def doLog(vstr):
    with open(todayStr + ".log",'a',encoding='utf-8') as fo:
        if vstr != "":
            fo.writelines(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()) + "\t" + vstr + "\n")
        else:
            fo.writelines("\n")
# Normalize a folder path to Windows form with a trailing backslash
def pathRep(vstr):
    if vstr != "":
        vstr = vstr.replace('/','\\').strip('\\') + '\\'
    return vstr
def formatFloat(num):
    if num != '':
        return '{:.2f}'.format(num)
# Append one line of text to a file
def saveFile(vfile,vstr):
    if vfile != "":
        with open(vfile,'a',encoding='utf-8') as fo:
            fo.writelines(vstr+"\n")
# Save a keyword to today's keyword file, skipping any keyword already stored
def saveKeyword(vstr):
    if vstr != "":
        # First check whether the keyword is already in the file
        if os.path.isfile(todayStr + '_' + url_ + '_keywords.txt'):
            with open(todayStr + '_' + url_ + '_keywords.txt','r',encoding='utf-8') as f:
                oldTxt = f.read()
            #print("current file size: " + str(len(oldTxt)))
            if oldTxt.find(vstr + "\n") != -1:
                print("\033[0;31;40m\t keyword [" + vstr + "] already extracted, skipping\033[0m ")
                return ""
        print("\033[0;32;40m\t keyword [" + vstr + "] saved\t\033[0m ")
        with open(todayStr + '_' + url_ + '_keywords.txt','a',encoding='utf-8') as fo:
            fo.writelines(vstr+"\n") #only keywords not already present reach this line
# Extract the keywords meta tag from page source
def pickKW(_str):
    if len(_str)>100:
        #doLog(_str)
        # Use a regex to pull out the keywords meta tag
        _arr = re.findall('<meta content="(.*?)" name="keywords"',_str,re.I)
        if len(_arr)<1:
            _arr = re.findall('<meta name="keywords" content="(.*?)"',_str,re.I)
        if len(_arr)>0:
            _kw = _arr[0]
            # Write the keywords to the keyword file
            if _kw != "":
                print("\033[0;32;40m\t page keywords tag: "+ _kw + "\t\033[0m ")
                # Normalize fullwidth commas, then split the tag into individual keywords
                _arr = _kw.replace(',',',').split(',')
                for ki in _arr:
                    saveKeyword(ki)
        else:
            print("\033[0;31;40m\t no keywords meta tag found\033[0m ")
# Resolve a Baidu redirect link to the real URL
def urlDecode(BDurl):
    try:
        res = requests.get(BDurl,allow_redirects=False)
        Real_url = res.headers['Location']
        return Real_url
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError', e.args)
        return("")
    except requests.exceptions.MissingSchema as e:
        print('Schema is none', e.args)
        return("")
    except:
        return("")
# Check whether the result page contains the target site, and get its position
def getKeyOrder(vstr):
    if len(vstr)>200:
        if vstr.find(url_) != -1:
            # Pull out each organic result block with a regex
            result = re.findall('class="result c-container new-pmd"[\s\S]*?style="text-decoration:none;position:relative;"\>(.*?)\<\/a\>',vstr,re.I)
            j = 0
            for oi in result:
                j = j+1
                if oi.find(url_) != -1:
                    return j
    return 0 # always return an int so callers can safely compare against 0
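# getKeyOrder therefore returns the 1-based position of the first result block
# that mentions the target domain on the current result page, and 0 otherwise.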
# Search Baidu for a keyword and record the target site's ranking
def getKeySearch(vkey):
    global browser
    if vkey != "":
        try:
            curUrl = browser.current_url
            if curUrl.find('www.baidu.com') < 0:
                browser.get('https://www.baidu.com/')
                time.sleep(1)
            # Locate the search box
            obj = browser.find_element_by_id('kw')
            obj.clear()
            obj.send_keys(vkey)
            browser.find_element_by_xpath("//*[@id='su']").click()
            time.sleep(2)
            _str_ = browser.page_source
            if _str_.find('百度安全驗(yàn)證') != -1: # marker text of Baidu's security-verification page
                doLog('security verification hit while searching ' + vkey + '; closing and reopening the browser')
                if splitSave>0:
                    time.sleep(10)
                else:
                    #browser.get('http://www.baidu.com') # visit the Baidu home page once
                    browser.quit()
                    time.sleep(1)
                    browser = webdriver.Chrome()
                    browser.get('https://www.baidu.com')
                    time.sleep(5)
                    obj = browser.find_element_by_id('kw')
                    obj.send_keys(vkey)
                    browser.find_element_by_xpath("//*[@id='su']").click()
                    time.sleep(2)
                    _str_ = browser.page_source
            _order = getKeyOrder(_str_)
            #print(_order)
            if _order>0:
                print("\033[0;31;40m\t\t\t" + vkey + ' ranks on Baidu: page 1, position ' + str(_order) + "\t\033[0m ")
                saveFile(todayStr + '_' + url_ + '_order.txt',"\t\t\t" + vkey + "\t ranks on Baidu page 1, position " + str(_order))
                return _order
            else:
                # Page through results 2 to 5
                for i in range(2,6):
                    time.sleep(1)
                    nextObj = browser.find_elements_by_css_selector("a[class='n']")
                    if len(nextObj)>0:
                        nextObj[len(nextObj)-1].click()
                        time.sleep(2)
                        _str_ = browser.page_source
                        _order = getKeyOrder(_str_)
                        if _order>0:
                            print("\033[0;31;40m\t\t\t" + vkey + ' ranks on Baidu: page ' + str(i) + ', position ' + str(_order) + "\t\033[0m ")
                            saveFile(todayStr + '_' + url_ + '_order.txt',"\t\t\t" + vkey + "\t ranks on Baidu page " + str(i) + ", position " + str(_order))
                            return _order
                print("\033[0;32;40m keyword [" + vkey + "] has no ranking\t\033[0m ")
                if splitSave != 0:
                    saveFile(todayStr + '_' + url_ + '_order_noranking.txt',vkey + " no ranking")
                else:
                    saveFile(todayStr + '_' + url_ + '_order.txt',vkey + " no ranking")
                if saveUnclude:
                    saveFile(vkey+'.txt',_str_)
                return 0
        except TimeoutException:
            print("request timed out")
            return ""
    else:
        print("empty keyword")
        return ""
# Fetch a page's source through the browser
def getHtml(vurl):
    global browser
    if vurl != "":
        try:
            browser.get(vurl)
            _str_ = browser.page_source
            if _str_.find('百度安全驗(yàn)證') != -1: # security-verification marker again
                doLog('security verification hit while visiting ' + vurl + '; closing and reopening the browser')
                if splitSave>0:
                    time.sleep(10)
                else:
                    #browser.get('http://www.baidu.com') # visit the Baidu home page once
                    browser.quit()
                    time.sleep(1)
                    browser = webdriver.Chrome()
                    browser.get('https://www.baidu.com')
                    time.sleep(5)
                    browser.get(vurl)
            return browser.page_source
        except TimeoutException:
            print("request timed out")
            return ""
    else:
        print("invalid link")
        return ""
print("程序成功啟動(dòng)")
if len(sys.argv)<2:
print("參數(shù)不足")
else:
url_ = sys.argv[1]
if len(sys.argv)>2:
splitSave = toInt(sys.argv[2]) #是否分開(kāi)保存有排名、無(wú)排名結(jié)果
if len(sys.argv)>3:
startPos = sys.argv[3]
if len(sys.argv)>4:
urlFile = sys.argv[4]
url_ = url_.replace("http://","")
url_ = url_.replace("https://","")
kwArr = []
#提取 [日期]_url.txt 文本,如果不存在,從百度獲取數(shù)據(jù)
if not os.path.isfile(todayStr + '_' + url_ + '_關(guān)鍵詞.txt'):
#當(dāng)日可能未抓取,先從百度中提取鏈接
urlArr = []
if urlFile !="":
if not os.path.isfile(urlFile):
urlFile=""
if urlFile=="":
for page in range(0,10):
print("正在從百度獲取URL,第"+str(page+1)+"頁(yè)")
htmlStr = getHtml("http://www.baidu.com/s?ie=utf-8&wd=site%3A%3A"+url_+"&pn=" + str(page*10)) #采用site::網(wǎng)址的形式,得出來(lái)的結(jié)果會(huì)更多
#if htmlStr.find('class="n">下一頁(yè)') != -1:
# _str = getHtml("http://www.baidu.com/s?ie=utf-8&wd=site%3A"+url_+"&rn=50&pn=" + str(page))
# htmlStr = htmlStr + _str
a_href =re.findall('<a.*?href="http\:\/\/www\.baidu\.com\/link\?url=(.*?)".*?',htmlStr,re.I) #利用正則,提取所有a鏈接
a_href = set(a_href) #過(guò)渡重復(fù)的鏈接
for i in a_href:
if i!="":
#提取真實(shí)url
real_url = urlDecode("http://www.baidu.com/link?url=" + i)
if real_url.find(url_) != -1:
if not real_url in urlArr:
urlArr.append(real_url)
saveFile(todayStr + '_' + url_ + '_url.txt',real_url)
print("獲取到鏈接:" + real_url)
else:
print(real_url + "已收錄")
#如果沒(méi)有“下一頁(yè)”,則結(jié)束
if htmlStr.find('class="n">下一頁(yè)') == -1:
break;
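                # For reference: Baidu paginates through the pn query parameter in
                # steps of ten (pn=0, 10, 20, ...), which is why the loop above
                # requests str(page*10); ten passes cover the first ten result pages.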
        else:
            # Read the URLs stored in the link file
            if os.path.isfile("indexed_" + urlFile):
                with open("indexed_" + urlFile) as uf:
                    uStr = uf.read()
                if uStr != '':
                    urlArr = uStr.split("\n")
            else:
                with open(urlFile) as uf:
                    uStr = uf.read()
                if uStr != '':
                    urlArr = uStr.split("\n")
                    if onlyFromUrlFile:
                        tmpArr = []
                        for ui in urlArr:
                            if ui != "":
                                htmlStr = getHtml("http://www.baidu.com/s?ie=utf-8&wd="+ui)
                                if htmlStr.find("<b>" + url_.replace("http://","")) != -1:
                                    print(ui + " is indexed")
                                    tmpArr.append(ui)
                                    saveFile("indexed_" + urlFile,ui)
                                else:
                                    print("\033[0;32;40m\t\t " + ui + " is not indexed\033[0m ")
                                #time.sleep(1)
                        if len(tmpArr)>0:
                            urlArr = tmpArr
                else:
                    print("could not read any URLs from the link file " + urlFile)
        print("links collected; extracting keyword tags next")
        # Walk through every URL in urlArr
        for ui in urlArr:
            _ui = ui
            if _ui.find('://') == -1:
                _ui = 'http://' + _ui
            res = requests.get(_ui,headers=headers) # send the browser-like User-Agent defined above
            if len(res.text)>100:
                # Extract the keywords meta tag from the page
                pickKW(res.text)
    # Read today's keyword list back in (guard against the case where no keywords were found)
    if os.path.isfile(todayStr + '_' + url_ + '_keywords.txt'):
        with open(todayStr + '_' + url_ + '_keywords.txt','r',encoding='utf-8') as f:
            kwStr = f.read()
        if kwStr != '':
            kwArr = kwStr.split("\n")
    # Query the ranking of every keyword
    for ki in kwArr:
        if ki != "":
            if startPos != "":
                if ki == startPos:
                    startPos = ""
                else:
                    print("skipping keyword [" + ki + "]")
                    continue
            print("searching for the keyword in 3s:\t" + ki)
            time.sleep(3)
            getKeySearch(ki)
browser.quit()
print("\033[0;32;40m\t all done\033[0m ")
The program's output while running is shown in the screenshot below: