欧美成人影院,国产精品一线,亚洲国产不卡,亚洲第一网站,欧美福利视频,99精品视频在线,日韩欧美人妻,亚洲日韩高清无码

查看詳情

基于 Python 實現多線程分頁采集網站段落內容的腳本工具

江西居道科技有限公司主營業務包含網站建設、APP開發、小程序開發、網絡推廣、SEO優化。網編人員免不了要幫客戶進行一些網站維護操作,但是各行各業特性不同,我們同事每次幫客戶維護網站時,都需要獲取大量的素材。圖片性質的素材倒是好辦,直接上百度圖片去找;但是,文字內容就不好弄了。

我們的網絡編輯人員以往都是去一些客戶同行網站上搜集相關的素材,然后稍加整理。但是,這種方式需要消耗大量的人力,而且沒什么技巧可言,純粹是人工操作。有鑒于此,我們程序開發人員采用 Python 寫了一個多線程分頁采集網站段落內容的腳本工具:根據設定好的參數自動對指定網站進行采集,提取網站上的段落內容,并將內容保存到本機。現公布相關代碼,方便大家使用,轉載請注明出處!


#!/usr/bin/python
import json
import os
import requests
import threading
import re
import time
import sys
import colorama

colorama.init(autoreset=True)

# Load the crawler settings from config.json in the current working directory.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale default; "utf-8-sig" additionally accepts
# the byte-order mark that some Windows editors prepend.
with open('config.json', 'r', encoding='utf-8-sig') as f:
    data = json.load(f)  # the with-block closes f; no explicit close() needed

	
def toInt(num):
    """Convert a config value to ``int``, treating "unset" values as 0.

    Args:
        num: The raw value, typically an ``int`` or a numeric string.

    Returns:
        0 when ``num`` is ``None``, an empty string, or a whitespace-only
        string; otherwise ``int(num)``.

    Raises:
        ValueError: If ``num`` is a non-blank string that is not a valid
            integer literal.
    """
    if num is None:
        # A JSON ``null`` means "not configured", same as an empty string.
        return 0
    if isinstance(num, str) and num.strip() == '':
        return 0
    return int(num)

# ---- Runtime state and settings taken from config.json ----
# Optional keys are read with .get() so that a config file which omits one
# falls back to an "unset" value ('' / [] / {}) instead of aborting with
# KeyError. Values that are present are used exactly as before.
thead_count = 0  # number of page threads not yet finished
start_ = toInt(data.get('start', ''))  # first page index
end_ = toInt(data.get('end', ''))  # last page index
url_ = data.get('url', '')  # entry URL; may contain the [pageindex] placeholder
urlinclude = data.get('urlinclude', [])  # a link must contain at least one of these
urlunclude = data.get('urlunclude', [])  # a link must contain none of these
textinclude = data.get('textinclude', '')  # substring a page must contain to be used
textunclude = data.get('textunclude', [])  # paragraphs containing any of these are dropped
textreplace = data.get('textreplace', [])  # substrings removed from saved paragraphs
textminsize = toInt(data.get('textminsize', ''))  # paragraph length must be greater than this
textmaxsize = toInt(data.get('textmaxsize', ''))  # paragraph length must be less than this
encoding_ = data.get('encoding') or 'utf-8'  # page encoding; defaults to utf-8 when unset
starttag = data.get('starttag', '')  # marker where content extraction begins
endtag = data.get('endtag', '')  # marker where content extraction ends
sleepTime = toInt(data.get('sleep', ''))  # delay in seconds between requests
jsonkey = data.get('jsonkey', '')  # field holding the HTML when the listing is JSON
headers_ = data.get('headers', {})  # HTTP request headers
todayStr = time.strftime("%Y%m%d", time.localtime())  # date stamp used in the log file name
total = 0  # number of paragraphs saved so far
    
# Logging
def doLog(vstr):
    """Append one entry to today's log file, named ``<YYYYMMDD>.log``.

    Args:
        vstr: Message to record. A non-empty message is written as a
            timestamp, a tab, the message and a newline; an empty string
            writes a single blank line.
    """
    # Explicit UTF-8 keeps logging from failing on characters that the
    # platform's default encoding cannot represent.
    with open(todayStr + ".log", 'a', encoding='utf-8') as fo:
        if vstr != "":
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            fo.write(stamp + "\t" + vstr + "\n")
        else:
            fo.write("\n")

def saveText(vstr):
    """Filter one paragraph and append it to the results file.

    The paragraph is rejected when it contains any non-empty entry of
    ``textunclude``. Otherwise every non-empty entry of ``textreplace`` is
    removed from it, the global ``total`` counter is incremented, and the
    paragraph is appended as one line to the results file.

    Args:
        vstr: Paragraph text. An empty string is ignored.

    Returns:
        ``""`` when the paragraph is rejected by ``textunclude``; ``None``
        in every other case.
    """
    global total
    if vstr == '':
        return None

    # Reject the paragraph outright if it contains a forbidden substring.
    if any(banned != '' and banned in vstr for banned in textunclude):
        return ""

    # Strip the configured noise substrings.
    for noise in textreplace:
        if noise != '':
            vstr = vstr.replace(noise, '')

    print("\033[0;32;40m\t 收集的內(nèi)容長度:" + str(len(vstr)) + "\t\033[0m ")
    total = total + 1
    # Scraped text can hold characters the platform default encoding cannot
    # represent, so write UTF-8 explicitly.
    with open("采集結(jié)果.txt", 'a', encoding='utf-8') as fo:
        fo.write(vstr + "\n")
    return None

def _fetch(target):
    """GET *target* with the configured headers and apply the configured encoding."""
    res = requests.get(target, headers=headers_ or {}, timeout=3)
    if encoding_:
        # Decode with the page encoding from the config rather than whatever
        # requests derives from the response headers.
        res.encoding = encoding_
    return res


def _link_allowed(link):
    """Return True when *link* passes the urlunclude and urlinclude filters."""
    for banned in urlunclude:
        if banned in link:
            print("\033[0;31;40m\t" + link + "\t無效\033[0m ")
            return False
    if len(urlinclude) > 0:
        return any(needed in link for needed in urlinclude)
    return True


def _extract_region(page):
    """Return the slice of *page* between starttag and endtag."""
    if starttag == "" and endtag == "":
        return page
    begin = 0
    finish = len(page)
    if starttag != "":
        # find() returns -1 when the marker is absent; clamp to 0 so the
        # slice starts at the top of the page instead of at its last char.
        begin = max(page.find(starttag), 0)
    if endtag != "":
        finish = page.find(endtag, begin)
    if begin >= finish:
        # End marker missing (or not after the start): keep everything.
        finish = len(page)
    return page[begin:finish]


def _save_paragraphs(link, page_text):
    """Strip tags from *page_text*, split it into paragraphs and save the ones within the size bounds."""
    # Force a line break after each paragraph so single-line HTML still splits.
    page_text = page_text.replace('</p>', "</p>\r\n")
    page_text = re.sub(r'</?\w+[^>]*>', '', page_text)
    for raw in page_text.split("\r\n"):
        para = raw.strip().replace("  ", " ")
        if textminsize < len(para) < textmaxsize:
            saveText(para)
        elif len(para) > textmaxsize:
            print(link + '的內(nèi)容長度為:' + str(len(para)))
            # Too long: try splitting once more on bare CR / LF.
            for piece in para.replace("\r", "\n").split("\n"):
                piece = piece.strip().replace("  ", " ")
                print('當前段落長度為:' + str(len(piece)))
                if textminsize < len(piece) < textmaxsize:
                    saveText(piece)
        else:
            print('段落不符合設(shè)定要求' + str(len(para)))


def getFromUrl(vurl):
    """Crawl one listing page and save paragraphs from every accepted link.

    The listing at ``vurl`` is fetched (or, when ``jsonkey`` is set, the HTML
    is taken from that field of the JSON response). Every distinct
    ``<a href="...">`` that passes the URL filters is fetched in turn; pages
    that contain ``textinclude`` are cut down to the starttag/endtag region,
    stripped of tags, and their paragraphs are passed to ``saveText``.

    A failed request is logged and skipped so the remaining links are still
    processed. When the function finishes it either decrements the global
    ``thead_count`` or, if it is the last outstanding thread, prints the
    elapsed time and the total number of saved paragraphs.

    Args:
        vurl: Absolute URL of the listing page. An empty string only
            updates the thread counter.
    """
    global thead_count
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import urljoin

    if vurl == '':
        if thead_count > 0:
            thead_count -= 1
        return

    print('即將從' + vurl + '頁面獲取可用鏈接')

    htmlStr = ''  # stays empty if the listing cannot be obtained
    try:
        res = _fetch(vurl)
        if jsonkey != '':
            _json = res.json()
            if _json[jsonkey] != '':
                htmlStr = _json[jsonkey]
            else:
                print("================== ERROR ===================")
        else:
            htmlStr = res.text
    except (requests.RequestException, ValueError, KeyError) as err:
        # Network failure, non-JSON body, or missing jsonkey field.
        doLog(vurl + ' 請求失敗:' + str(err))
        print(vurl + ' 請求失敗:' + str(err))

    # set() removes duplicate links.
    for href in set(re.findall('<a.*?href="(.*?)".*?', htmlStr, re.I)):
        if not _link_allowed(href):
            continue

        # Resolve relative links against the listing URL.
        target = urljoin(vurl, href)

        if sleepTime > 0:
            print('延時' + str(sleepTime) + '秒后開始采集')
            time.sleep(sleepTime)

        doLog('開始采集:' + target)
        try:
            html_ = _fetch(target).text
        except requests.RequestException as err:
            doLog(target + ' 請求失敗:' + str(err))
            print(target + ' 請求失敗:' + str(err))
            continue

        if html_ != '':
            if len(textinclude) > 0 and html_.find(textinclude) == -1:
                print(target + "\t不存在特定內(nèi)容,視為無效!")
            else:
                _save_paragraphs(target, _extract_region(html_))

        print(target)

    print(vurl + " 采集完成,退出線程\n")
    # NOTE(review): thead_count is shared between threads without a lock.
    if thead_count == 1:
        elapsed = time.time() - start_time
        print('任務(wù)已完成,共用時:' + ('%.2f' % elapsed) + 's')
        print('共計:' + str(total))
        # In a worker thread this raises SystemExit, which ends only that thread.
        sys.exit()
    elif thead_count > 0:
        thead_count -= 1

# ---- Script entry: normalise the page range, then start crawling ----
print("程序成功啟動")

# Page numbers start at 1 and the range always covers at least one page.
start_ = max(start_, 1)
end_ = max(end_, start_)

thread_list = []
start_time = time.time()
print('江西居道科技有限公司為您提供技術(shù)服務(wù),www.xhjnt.cn,轉(zhuǎn)載請注明出處')

if '[pageindex]' in url_:
    # Paged mode: build one thread per page number first, then start them.
    for page_no in range(start_, end_ + 1):
        page_url = url_.replace('[pageindex]', str(page_no))
        thread_list.append(threading.Thread(target=getFromUrl, args=(page_url,)))
        thead_count += 1

    for worker in thread_list:
        worker.start()

        # Stagger thread start-up by the configured delay.
        if sleepTime > 0:
            print('延時' + str(sleepTime) + '秒后繼續(xù)')
            time.sleep(sleepTime)
else:
    # Single-page mode: crawl the entry URL directly in the main thread.
    getFromUrl(url_)


此外,還需要一個 config.json 配置文件,用來設定一些參數信息,代碼如下:

{"start":1,"end":2,"url":"http://www.xhjnt.cn/articleslist.html","urlinclude":["jsruixi/vip_doc"],"urlunclude":[],"textinclude":"</h1>","textunclude":["___","www.","://"],"textreplace":["南京","1、","2、","3、","4、","5、","6、","7、","8、","9、","①、","①.","②、","②.","③、","③.","④、","④.","⑤、","⑤.","⑥、","⑥.","⑦、","⑦.","⑧、","⑧.","⑨、","⑨.","⑩、","⑩.","⑴、","⑴.","⑵、","⑵.","⑶、","⑶.","⑷、","⑷.","⑸、","⑸.","⑹、","⑹.","⑺、","⑺.","⑻、","⑻.","⑼、","⑼.","⑽、","⑽.","一、","一.","二、","二.","三、","三.","四、","四.","五、","五.","六、","六.","七、","七.","八、","八.","九、","九.","十、","十.","1)、","1).","2)、","2).","3)、","3).","4)、","4).","5)、","5).","6)、","6).","7)、","7).","8)、","8).","①","⑴","1)","②","⑵","2)","③","⑶","3)","④","⑷","4)","⑤","⑸","5)","⑥","⑹","6)","⑦","⑺","7)","⑧","⑻","8)","⑨","⑼","⑩","⑽","(1)","(2)","(3)","(4)","(5)","(6)","(7)","(8)","(9)","(10)"],"textminsize":100,"textmaxsize":300,"encoding":"utf-8","starttag":"</h1>","endtag":"<div class=\"p-details-pre-nex\" id=\"pDetailsPreNext\">","sleep":3,"jsonkey":"","headers":{"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3","Accept - Encoding":"gzip, deflate, br","Accept-Language":"zh-CN,zh;q=0.9","Connection":"Keep-Alive","Host":"www.xhjnt.cn","User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}}


相關腳本提供下載,點擊下載

運行效果如下圖:
基于 Python 實現多線程分頁采集網站段落內容的腳本工具運行效果


原創內容,轉載請注明出處:網站建設、APP開發、小程序開發請找江西居道科技有限公司,http://www.xhjnt.cn

智能建站系統代理招商
所屬分類:文章中心  Time:2020-10-14 01:19:15  人氣:663
關閉
13517086454
汉中市| 漠河县| 鹿邑县| 台中市| 辛集市| 瑞安市| 棋牌| 聂拉木县| 沾益县| 静安区| 德阳市| 东辽县| 万宁市| 丹寨县| 桐梓县| 江安县| 扎赉特旗| 祥云县| 宝坻区| 凤庆县| 临海市| 丰宁| 获嘉县| 昌平区| 响水县| 邢台市| 孝感市| 农安县| 高青县| 宜宾市| 江油市| 车险| 兴化市| 赤壁市| 兴隆县| 连城县| 长寿区| 浦县| 北海市| 图们市| 宁乡县|