批量提取 sitemap.xml 中的链接生成 sitemap.txt (Python脚本)_前端开发者_网站前端开发

前端开发者_网站前端开发丨 前端开发

https://www.rokub.com

 

小站用了 DedeCMS 的一个插件,它能生成若干个 Google sitemap 的 .xml 文件,然而却没有对应的利于百度收录的插件可使用。在网上查了一下,说是要把这些 .xml 文件中的链接提取出来,生成一个对应的 sitemap.txt 才行,然后以如下格式放到 robots.txt 中:

 

Sitemap: http://www.seozoe.com/sitemap.xml. http://www.seozoe.com/sitemap.txt

 

可是一个一个手动提取其中的链接太麻烦了,我用了20分钟写了一个 python 脚本,方便这个操作,当然程序还很简陋,但也足够我用了。

 

现在的功能是:从网站下载 sitemap.xml 文件,提取其中的链接生成 sitemap.txt 然后再传回网站,最后生成一个 robot.txt 作为参考。因为文件较多,我用了一个线程池,加快处理速度。
# http://www.seozoe.com/sitemap.txt
这个小脚本是在 linux 下写的,测试运行良好,我没有在 win32 平台下测试,但也应该能够工作。

 

[python] view plain copy
#!/usr/bin/env python
#coding=utf-8
#############################################################
#
#
#
#
#
#############################################################
import glob
import os
import re
import urllib
from ftplib import FTP

try:  # Python 2 module layout
    import urlparse
    from urllib import urlretrieve
except ImportError:  # Python 3 moved both into the urllib package
    import urllib.parse as urlparse
    from urllib.request import urlretrieve

from common import *
# Captures the text inside each <loc>...</loc> element of a sitemap file.
# The group is non-greedy so that several <loc> elements sharing one line
# yield one match each instead of a single merged match.
p = re.compile(r"<loc>(.*?)</loc>", re.I)

# Common prefix of the sitemap file names (googlesitemap_<n>.xml).
sitemapname = 'googlesitemap'

# Connection settings read by uploadsitemaptxt().
# NOTE(review): the credentials are hard-coded in the source; consider
# loading them from the environment or a config file that is not published.
ftpdict = {
    'server': '218.95.37.111',
    'port': 21,
    'user': 'testuser',
    'pwd': 'littlesecret',
}
def do_robot(filename, baseurl='http://www.python8.org/'):
    """Append the .xml/.txt URL pair for *filename* to ``robot.txt``.

    The directory part and the extension of *filename* are dropped; the
    remaining base name is joined onto *baseurl* once with ``.xml`` and once
    with ``.txt``.  One line of the form ``<xml-url>. <txt-url>`` is appended
    to ``robot.txt`` in the current working directory.

    Args:
        filename: Name (or path) of a sitemap file.
        baseurl: URL the two file names are resolved against.
    """
    basefilename = os.path.splitext(os.path.split(filename)[1])[0]
    webfilename0 = urlparse.urljoin(baseurl, basefilename + '.xml')
    webfilename1 = urlparse.urljoin(baseurl, basefilename + '.txt')
    with open('robot.txt', 'a') as f:
        # The file is opened in text mode, which already translates '\n' to
        # the platform line ending; writing os.linesep here would produce
        # '\r\r\n' on Windows.
        f.write(webfilename0 + '. ' + webfilename1 + '\n')
def uploadsitemaptxt(filename):
    """Upload *filename* over FTP using the settings in ``ftpdict``.

    Connects to ``ftpdict['server']``/``ftpdict['port']``, logs in with
    ``ftpdict['user']``/``ftpdict['pwd']``, changes to the remote directory
    ``wwwroot/python8_org`` and stores the file there under the same name.

    Args:
        filename: Path of the local file to send; also used verbatim as the
            remote name in the ``STOR`` command.
    """
    ftp = FTP()
    try:
        # Pass the port as a number rather than a formatted string.
        ftp.connect(ftpdict['server'], int(ftpdict['port']))
        ftp.login(ftpdict['user'], ftpdict['pwd'])
        ftp.cwd('wwwroot/python8_org')
        bufsize = 1024
        with open(filename, 'rb') as filehandle:
            ftp.storbinary('STOR %s' % filename, filehandle, bufsize)
        ftp.quit()
    finally:
        # Releases the socket when any step above failed; calling it after a
        # successful quit() is harmless.
        ftp.close()
def dositemap(filename, baseurl='http://www.python8.org/'):
    """Download one sitemap file and hand it to ``sitemap2txt``.

    The file is fetched from *baseurl* joined with *filename* and saved under
    *filename* in the current working directory.  ``sitemap2txt`` is only
    called when the download left a file on disk.

    Args:
        filename: Name of the sitemap file, both remotely and locally.
        baseurl: URL the file name is resolved against.
    """
    webfile = urlparse.urljoin(baseurl, filename)
    urlretrieve(webfile, filename)
    if os.path.exists(filename):
        sitemap2txt(filename)
def sitemap2txt(filename):
    """Write the links found in a sitemap file to a .txt file and upload it.

    Every match of the module-level pattern ``p`` in *filename* is written,
    one per line, to ``<base name of filename>.txt`` in the current working
    directory.  The new file is then passed to ``uploadsitemaptxt`` and a
    status line is printed.

    Args:
        filename: Path of the local sitemap file to read.
    """
    with open(filename, 'r') as source:
        data = source.read()
    urlList = p.findall(data)
    newfilename = os.path.splitext(os.path.split(filename)[1])[0] + '.txt'
    # Close the output file before uploading so that everything is flushed
    # to disk.  '\n' is used because text mode already converts it to the
    # platform line ending.
    with open(newfilename, 'w') as target:
        target.writelines(eachurl + '\n' for eachurl in urlList)
    if os.path.exists(newfilename):
        uploadsitemaptxt(newfilename)
        # Same output as the former print statement: the four items
        # separated by single spaces.
        print(' '.join([filename.ljust(20), ' ==> ', newfilename.ljust(20), 'OK!']))
def cleaner():
    """Delete the temporary sitemap files from the current directory.

    Removes every file matching ``g*.xml`` or ``g*.txt`` and prints a
    confirmation afterwards.
    """
    from glob import glob

    for pattern in ('g*.xml', 'g*.txt'):
        for eachfile in glob(pattern):
            os.remove(eachfile)
    print('all temp file cleared')
def main():
    """Rebuild ``robot.txt`` and convert all sitemap files using a thread pool.

    Removes a stale ``robot.txt``, then for each sitemap file name adds a line
    to ``robot.txt`` via ``do_robot`` and queues a ``dositemap`` job.  The
    workers are started once all jobs are queued; after they finish the
    temporary files are removed with ``cleaner``.
    """
    print('Initializing, please wait…')
    if os.path.exists('robot.txt'):
        os.remove('robot.txt')
    # Ten threads work at the same time (the hosting provider is not amused).
    wm = WorkerManager(10)
    # There are 100 sitemap files, named googlesitemap_1.xml through
    # googlesitemap_100.xml.
    for each in range(1, 101):
        eachfile = sitemapname + '_%s.xml' % each
        do_robot(eachfile)
        wm.add_job(dositemap, eachfile)
    wm.start()
    wm.wait_for_complete()
    cleaner()  # remove the downloaded .xml files and the generated .txt files
# http://www.seozoe.com/sitemap.txt
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

脚本中引用的 common 模块中存放了一个线程池(抄来的 -O-),代码如下:
# http://www.seozoe.com/sitemap.txt

 

[python] view plain copy
import threading
import traceback

try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # the module was renamed in Python 3


# working thread
class Worker(threading.Thread):
    """Daemon thread that runs jobs taken from a shared work queue.

    Each job is a ``(callable, args, kwds)`` tuple.  The return value of every
    job that completes is put on the result queue.
    """

    # Number of Worker objects created so far; the next worker's id.
    worker_count = 0

    def __init__(self, workQueue, resultQueue, timeout=0, **kwds):
        """Create a worker bound to the given queues.

        Args:
            workQueue: Queue the worker takes ``(callable, args, kwds)``
                tuples from.
            resultQueue: Queue the worker puts job return values on.
            timeout: Seconds to wait for a job before the worker exits.
            **kwds: Passed through to ``threading.Thread.__init__``.
        """
        threading.Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        self.daemon = True
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.timeout = timeout
        self.thread_evt = threading.Event()

    def stop(self):
        """Ask the worker to exit before it takes another job."""
        self.thread_evt.set()

    def run(self):
        """The get-some-work, do-some-work main loop of worker threads.

        The loop ends when ``stop()`` has been called or when no job arrives
        within ``self.timeout`` seconds.
        """
        while not self.thread_evt.is_set():
            try:
                func, args, kwds = self.workQueue.get(timeout=self.timeout)
            except Queue.Empty:
                break
            try:
                res = func(*args, **kwds)
            except Exception:
                # One failing job must not end the thread and strand the
                # jobs still queued; report the error and carry on.
                traceback.print_exc()
                continue
            self.resultQueue.put(res)
class WorkerManager(object):
    """Small thread pool: a work queue, a result queue and a list of workers.

    Jobs are queued with ``add_job``; ``start`` launches the workers and
    ``wait_for_complete`` blocks until all of them have exited.
    """

    def __init__(self, num_of_workers=10, timeout=1):
        """Create the queues and *num_of_workers* workers (not yet started).

        Args:
            num_of_workers: Number of worker threads to create.
            timeout: Seconds each worker waits for a job before exiting.
        """
        self.workQueue = Queue.Queue()
        self.resultQueue = Queue.Queue()
        self.workers = []
        self.timeout = timeout
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        """Create *num_of_workers* Worker objects sharing this pool's queues."""
        for _ in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue, self.timeout)
            self.workers.append(worker)

    def stop(self):
        """Call ``stop()`` on every worker."""
        for w in self.workers:
            w.stop()

    def start(self):
        """Start every worker thread."""
        for w in self.workers:
            w.start()

    def wait_for_complete(self):
        """Block until every worker has terminated; empties ``self.workers``."""
        # join() without a timeout only returns once the thread has ended, so
        # there is no need to check whether the worker is still alive.
        while self.workers:
            worker = self.workers.pop()
            worker.join()

    def add_job(self, callable, *args, **kwds):
        """Queue ``callable(*args, **kwds)`` for execution by a worker."""
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        """Return the next item of the result queue.

        All arguments are forwarded to the result queue's ``get``.
        """
        return self.resultQueue.get(*args, **kwds)

 

最后生成的 http://www.seozoe.com/sitemap.txt 。

前端开发者_网站前端开发丨 前端开发

https://www.rokub.com

赞(0)
前端开发者 » 批量提取 sitemap.xml 中的链接生成 sitemap.txt (Python脚本)_前端开发者_网站前端开发
64K

评论 抢沙发

评论前必须登录!