由于工作的原因,最近需要生成网站的sitemap.xml。谷歌、百度了很多地方,并没有发现合适可用的代码,三思之后还是决定自己写吧!虽然写得可能有所缺陷,但毕竟是认认真真写的,希望对一些后来者有所帮助......
1、为什么要自己写脚本生成sitemap.xml?
很多人会说,网上有现成的工具,扫一下就可以了,没有必要自己写。是的,的确是这样的。但是假设我们的网站经常更新,那么是不是每次都要手动更新sitemap呢?我很懒,那么,有没有更好的方案呢?肯定是有的:可以起一个定时任务,每天晚上更新一次,此时脚本就有用武之地了。
2、文档目录:
配置文件 - config/config.ini.php;sitemap主文件 - SiteMap.class.php
3、主文件代码
/**
 * Crawls a web site starting from the configured DOMAIN_NAME and writes the
 * pages it finds to a sitemap.xml file.
 *
 * The script must be run from the command line. All settings are read from
 * config/config.ini.php, located next to this file.
 *
 * @version 1.0
 */

namespace Maweibinguo\SiteMap;

class SiteMap
{
    const SCHEMA = 'http://www.sitemaps.org/schemas/sitemap/0.9';

    /**
     * URLs that have already been discovered (used to avoid crawling twice).
     *
     * @var array
     * @access public
     */
    public $webUrlList = array();

    /**
     * Collected sitemap entries; each item has the keys
     * 'Url', 'Title', 'Priority' and 'ChangeFreq'.
     *
     * @var array
     * @access public
     */
    public $siteMapList = array();

    /**
     * @var bool
     * @access public
     */
    public $isUseCookie = false;

    /**
     * @var string
     * @access public
     */
    public $cookieFilePath = '';

    /**
     * @var \XMLWriter
     * @access private
     */
    private $_xmlWriter = null;

    /**
     * Variables defined by the config file, loaded on first use.
     *
     * @var array|null
     * @access private
     */
    private $_configCache = null;

    /**
     * Check the runtime environment and create the XML writer.
     *
     * @access public
     */
    public function __construct()
    {
        $this->_enviromentTest();
        $this->_xmlWriter = new \XMLWriter();
    }

    /**
     * Abort unless the script is running under the CLI SAPI.
     *
     * @access private
     */
    private function _enviromentTest()
    {
        $sapiType = \php_sapi_name();
        if (strtolower($sapiType) != 'cli') {
            echo ' The Script Must Run In Command Lines ', "\r\n";
            exit();
        }
    }

    /**
     * Return the value of one variable defined in config/config.ini.php.
     *
     * The config file is included only once per instance. A name that the
     * config file does not define yields false, which is the value that
     * sendRequest() tests for.
     *
     * @param string $configName name of the config variable
     * @return mixed the configured value, or false when it is not defined
     * @access public
     */
    public function loadConfig($configName)
    {
        if ($this->_configCache === null) {
            $this->_configCache = $this->_readConfigFile();
        }

        if (!is_string($configName) || !array_key_exists($configName, $this->_configCache)) {
            return false;
        }

        return $this->_configCache[$configName];
    }

    /**
     * Include the config file and return every variable it defines.
     *
     * Terminates the script when the config file does not exist.
     *
     * @return array map of variable name => value
     * @access private
     */
    private function _readConfigFile()
    {
        $configPath = __DIR__ . '/config/config.ini.php';
        if (!file_exists($configPath)) {
            echo "Can not find config file", "\r\n";
            exit();
        }

        require $configPath;

        /* everything in scope now, minus our own locals, came from the config file */
        $definedVars = get_defined_vars();
        unset($definedVars['configPath'], $definedVars['this']);

        return $definedVars;
    }

    /**
     * Write the given entries to the file configured as SITEMAPPATH.
     *
     * Only <loc>, <changefreq> and <priority> are written. The sitemap 0.9
     * schema declared on <urlset> has no <title> element, and requires
     * changefreq to be one of the lower-case protocol values and priority to
     * lie between 0.0 and 1.0, so values are normalised accordingly.
     *
     * Terminates the script when the list is empty or the target file cannot
     * be written.
     *
     * @param array $siteMapList entries as produced by getSiteMapList()
     * @return bool true when the document was written, false otherwise
     * @access public
     */
    public function generateSiteMapXml($siteMapList)
    {
        if (!is_array($siteMapList) || count($siteMapList) <= 0) {
            echo 'The SiteMap Cotent Is Empty', "\r\n";
            exit();
        }

        $siteMapPath = $this->loadConfig('SITEMAPPATH');
        $hasPath = is_string($siteMapPath) && $siteMapPath !== '';

        /* create the file natively instead of passing the path through a shell */
        if ($hasPath && !file_exists($siteMapPath)) {
            touch($siteMapPath);
        }
        if (!$hasPath || !is_writable($siteMapPath)) {
            echo 'Is Not Writeable', "\r\n";
            exit();
        }

        $writer = $this->_xmlWriter;
        if (!$writer->openURI($siteMapPath)) {
            return false;
        }

        $allowedChangefreq = array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never');

        $writer->startDocument('1.0', 'UTF-8');
        $writer->setIndent(true);
        $writer->startElement('urlset');
        $writer->writeAttribute('xmlns', self::SCHEMA);

        foreach ($siteMapList as $siteMapItem) {
            /* an entry without a location cannot be expressed in a sitemap */
            if (!is_array($siteMapItem) || empty($siteMapItem['Url'])) {
                continue;
            }

            $changefreq = '';
            if (isset($siteMapItem['ChangeFreq']) && is_string($siteMapItem['ChangeFreq'])) {
                $changefreq = strtolower(trim($siteMapItem['ChangeFreq']));
            }
            if (!in_array($changefreq, $allowedChangefreq, true)) {
                $changefreq = 'daily';
            }

            $priority = 0.5;
            if (isset($siteMapItem['Priority']) && is_numeric($siteMapItem['Priority'])) {
                $priority = max(0.0, min(1.0, (float) $siteMapItem['Priority']));
            }

            $writer->startElement('url');
            $writer->writeElement('loc', $siteMapItem['Url']);
            $writer->writeElement('changefreq', $changefreq);
            /* explicit '.' keeps the number independent of the process locale */
            $writer->writeElement('priority', number_format($priority, 1, '.', ''));
            $writer->endElement();
        }

        $writer->endElement();
        $writer->endDocument();
        $writer->flush();

        return true;
    }

    /**
     * Fetch a URL with cURL and return the response body.
     *
     * @param string $url absolute URL to request
     * @return mixed the body as a string, or false when the URL is invalid,
     *               a timeout setting is missing, or the final HTTP status
     *               is not 200
     * @access public
     */
    public function sendRequest($url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }

        $connectTimeOut = $this->loadConfig('CURLOPT_CONNECTTIMEOUT');
        if ($connectTimeOut === false) {
            return false;
        }

        $timeOut = $this->loadConfig('CURLOPT_TIMEOUT');
        if ($timeOut === false) {
            return false;
        }

        $handle = curl_init();
        if ($handle === false) {
            return false;
        }

        $headersItem = array(
            'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection: Keep-Alive'
        );

        curl_setopt($handle, CURLOPT_URL, $url);
        curl_setopt($handle, CURLOPT_HEADER, false);
        curl_setopt($handle, CURLOPT_AUTOREFERER, true);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, $connectTimeOut);
        curl_setopt($handle, CURLOPT_TIMEOUT, $timeOut);
        curl_setopt($handle, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_setopt($handle, CURLOPT_HTTPHEADER, $headersItem);
        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, 1);

        $cookieList = $this->loadConfig('COOKIELIST');
        $useCookie = is_array($cookieList)
            && !empty($cookieList['IsUseCookie'])
            && !empty($cookieList['CookiePath']);
        if ($useCookie) {
            $cookieFilePath = $cookieList['CookiePath'];
            if (!file_exists($cookieFilePath)) {
                /* native touch(): no shell involved, so the path needs no escaping */
                touch($cookieFilePath);
            }
            curl_setopt($handle, CURLOPT_COOKIEFILE, $cookieFilePath);
            curl_setopt($handle, CURLOPT_COOKIEJAR, $cookieFilePath);
        }

        $responseData = curl_exec($handle);
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        if ((int) $httpCode !== 200) {
            return false;
        }

        return $responseData;
    }

    /**
     * Crawl the site starting at $url and collect one sitemap entry per page.
     *
     * Pages are visited breadth-first using an explicit queue, so the crawl
     * depth is not limited by the call stack. Every URL, including the start
     * URL, is recorded in $webUrlList the first time it is seen, which keeps
     * a page from being fetched and listed twice. Each newly discovered URL
     * is echoed. Links containing any SKIP_URLLIST keyword are ignored.
     *
     * @param string $url URL to start crawling from
     * @access public
     */
    public function generateSiteMapList($url)
    {
        $skipUrlList = $this->loadConfig('SKIP_URLLIST');
        if (!is_array($skipUrlList)) {
            $skipUrlList = array();
        }

        /* hash lookup of known URLs; honours entries already in $webUrlList */
        $seen = array();
        foreach ($this->webUrlList as $knownUrl) {
            $seen[(string) $knownUrl] = true;
        }

        $startUrl = trim((string) $url);
        if (!isset($seen[$startUrl])) {
            $seen[$startUrl] = true;
            $this->webUrlList[] = $startUrl;
        }

        $pending = new \SplQueue();
        $pending->enqueue($url);

        while (!$pending->isEmpty()) {
            $currentUrl = $pending->dequeue();

            $content = $this->sendRequest($currentUrl);
            if ($content === false) {
                continue;
            }

            $tagsList = $this->_parseContent($content, $currentUrl);
            $urlItem = isset($tagsList['UrlItem']) ? $tagsList['UrlItem'] : array();
            $title = isset($tagsList['Title']) ? $tagsList['Title'] : '';

            $siteMapItem = array(
                'Url' => trim($currentUrl),
                'Title' => trim($title)
            );
            $siteMapItem['Priority'] = $this->_calculatePriority($siteMapItem['Url']);
            $siteMapItem['ChangeFreq'] = $this->_calculateChangefreq($siteMapItem['Url']);
            $this->siteMapList[] = $siteMapItem;

            foreach ($urlItem as $nextUrl) {
                if (isset($seen[$nextUrl]) || $this->_isSkippedUrl($nextUrl, $skipUrlList)) {
                    continue;
                }

                $seen[$nextUrl] = true;
                $this->webUrlList[] = $nextUrl;
                echo $nextUrl, "\r\n";
                $pending->enqueue($nextUrl);
            }
        }
    }

    /**
     * Tell whether a URL contains one of the skip keywords (case-insensitive).
     *
     * @param string $url
     * @param array $skipUrlList keywords from SKIP_URLLIST
     * @return bool
     * @access private
     */
    private function _isSkippedUrl($url, $skipUrlList)
    {
        foreach ($skipUrlList as $keyWords) {
            $keyWords = (string) $keyWords;
            if ($keyWords !== '' && stripos($url, $keyWords) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Return the sitemap entries collected so far.
     *
     * @access public
     * @return array $siteMapList
     */
    public function getSiteMapList()
    {
        return $this->siteMapList;
    }

    /**
     * Look up the priority for a URL in PRIORITYLIST.
     *
     * @param string $targetUrl
     * @return mixed value of the first matching keyword, 0.5 when none matches
     * @access private
     */
    private function _calculatePriority($targetUrl)
    {
        return $this->_matchByKeyword($targetUrl, 'PRIORITYLIST', 0.5);
    }

    /**
     * Look up the change frequency for a URL in CHANGEFREQLIST.
     *
     * @param string $targetUrl
     * @return mixed value of the first matching keyword, 'Daily' when none matches
     * @access private
     */
    private function _calculateChangefreq($targetUrl)
    {
        return $this->_matchByKeyword($targetUrl, 'CHANGEFREQLIST', 'Daily');
    }

    /**
     * Return the value of the first keyword in a keyword => value config map
     * that occurs (case-insensitively) in the URL.
     *
     * @param string $targetUrl
     * @param string $configName name of the config map
     * @param mixed $default returned for an invalid URL, a missing map, or no match
     * @return mixed
     * @access private
     */
    private function _matchByKeyword($targetUrl, $configName, $default)
    {
        if (!filter_var($targetUrl, FILTER_VALIDATE_URL)) {
            return $default;
        }

        $keywordMap = $this->loadConfig($configName);
        if (!is_array($keywordMap)) {
            return $default;
        }

        foreach ($keywordMap as $keyword => $value) {
            $keyword = (string) $keyword;
            if ($keyword !== '' && stripos($targetUrl, $keyword) !== false) {
                return $value;
            }
        }

        return $default;
    }

    /**
     * Turn a raw href value into an absolute URL.
     *
     * Surrounding quotes and whitespace are removed, HTML entities decoded
     * and the fragment dropped. http(s) URLs are returned unchanged,
     * protocol-relative URLs inherit the scheme of $originUrl, root-relative
     * paths are joined to DOMAIN_NAME, and other relative paths are joined
     * to the directory of $originUrl ('./' and '../' segments are not
     * resolved). Links with any other scheme (javascript:, mailto:, ...) and
     * the placeholders '/', '\' and 'javascript' produce an empty string.
     *
     * @param string $url raw href value, possibly still quoted
     * @param string $originUrl URL of the page the link was found on
     * @access private
     * @return string absolute URL, or '' when the link should be ignored
     */
    private function _formatUrl($url, $originUrl)
    {
        if (empty($url) || empty($originUrl)) {
            return '';
        }

        /* quotes have to go before the fragment, otherwise "#" survives as '#' */
        $formatUrl = trim($url, " \t\n\r\0\x0B\"'");
        $formatUrl = html_entity_decode($formatUrl, ENT_QUOTES, 'UTF-8');

        $hashPos = strpos($formatUrl, '#');
        if ($hashPos !== false) {
            $formatUrl = (string) substr($formatUrl, 0, $hashPos);
        }
        $formatUrl = trim($formatUrl);
        if ($formatUrl === '') {
            return '';
        }

        /* protocol-relative link: reuse the scheme of the referring page */
        if (strpos($formatUrl, '//') === 0) {
            $scheme = parse_url($originUrl, PHP_URL_SCHEME);
            return ($scheme ? $scheme : 'http') . ':' . $formatUrl;
        }

        if (preg_match('#^https?://#i', $formatUrl)) {
            return $formatUrl;
        }

        /* any other scheme cannot be crawled over HTTP */
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $formatUrl)) {
            return '';
        }

        $badUrlItem = array('\\', '/', 'javascript');
        if (in_array($formatUrl, $badUrlItem, true)) {
            return '';
        }

        if (strpos($formatUrl, '/') === 0) {
            $domainName = $this->loadConfig('DOMAIN_NAME');
            if (!is_string($domainName) || $domainName === '') {
                return '';
            }
            /* exactly one slash at the join, whatever DOMAIN_NAME ends with */
            return rtrim($domainName, '/') . '/' . ltrim($formatUrl, '/');
        }

        /* document-relative: join to the directory of the referring page */
        $originBase = preg_replace('/[?#].*$/s', '', $originUrl);
        $schemeEnd = strpos($originBase, '://');
        $pathStart = ($schemeEnd === false) ? 0 : $schemeEnd + 3;
        $lastSlash = strrpos($originBase, '/');
        if ($lastSlash !== false && $lastSlash >= $pathStart) {
            /* only cut at a slash belonging to the path, never inside '://' */
            $originBase = substr($originBase, 0, $lastSlash);
        }

        return $originBase . '/' . $formatUrl;
    }

    /**
     * Tell whether a URL belongs to the configured site.
     *
     * The URL must begin with DOMAIN_NAME (case-insensitive); a mere
     * occurrence of the domain later in the URL, for example inside a query
     * string, does not count.
     *
     * @param string $url
     * @return bool
     * @access private
     */
    private function _checkDomain($url)
    {
        if (!$url) {
            return false;
        }

        $domainName = $this->loadConfig('DOMAIN_NAME');
        if (!is_string($domainName) || $domainName === '') {
            return false;
        }

        return stripos($url, $domainName) === 0;
    }

    /**
     * Extract the on-site links and the page title from an HTML document.
     *
     * The patterns are matched byte-wise (no 'u' modifier) so that pages
     * which are not valid UTF-8 are still parsed; the patterns themselves
     * contain ASCII only.
     *
     * @param string $content HTML source of the page
     * @param string $originUrl URL the content was fetched from
     * @return array empty when either argument is empty, otherwise
     *               'UrlItem' => list of unique absolute URLs and
     *               'Title' => content of the <title> element ('' if absent)
     * @access public
     */
    public function _parseContent($content, $originUrl)
    {
        $tagsList = array();
        if (empty($content) || empty($originUrl)) {
            return $tagsList;
        }

        /* href of every <a>, regardless of where the attribute appears in the tag */
        $uniqueUrls = array();
        $regStrForTagA = '#<\s*a\s+(?:[^>]*?\s)?href\s*=\s*("[^"]*"|\'[^\']*\')#is';
        if (preg_match_all($regStrForTagA, $content, $matches)) {
            foreach (array_unique($matches[1]) as $url) {
                $formatUrl = $this->_formatUrl($url, $originUrl);
                if (empty($formatUrl) || $this->_checkDomain($formatUrl) === false) {
                    continue;
                }
                /* keyed by URL so links that normalise to the same target collapse */
                $uniqueUrls[$formatUrl] = true;
            }
        }
        $tagsList['UrlItem'] = array_map('strval', array_keys($uniqueUrls));

        $title = '';
        $regStrForTitle = '#<\s*title\b[^>]*>(.*?)<\s*/\s*title\s*>#is';
        if (preg_match($regStrForTitle, $content, $matches)) {
            $title = $matches[1];
        }
        $tagsList['Title'] = $title;

        return $tagsList;
    }
}

/* here is a example */
$startTime = microtime(true);
echo "/***********************************************************************/", "\r\n";
echo "/* start to run {$startTime} */", "\r\n";
echo "/***********************************************************************/", "\r\n\r\n";

$siteMap = new SiteMap();
$domain = $siteMap->loadConfig('DOMAIN_NAME');
$siteMap->generateSiteMapList($domain);
$siteMapList = $siteMap->getSiteMapList();
$siteMap->generateSiteMapXml($siteMapList);

$endTime = microtime(true);
$takeTime = $endTime - $startTime;
echo "/***********************************************************************/", "\r\n";
echo "/* Had Done, \t it total take {$takeTime} */", "\r\n";
echo "/***********************************************************************/", "\r\n";
?>
4、配置文件代码
true, 'CookiePath' => '/tmp/sitemapcookie' ); //sitemap文件的保存地址 $SITEMAPPATH = './sitemap.xml'; //根据连接关键字设置priority $PRIORITYLIST = array( 'product' => '0.8', 'device' => '0.6', 'intelligent' => '0.4', 'course' => '0.2' ); //根据连接关键字设置CHANGEFREQ $CHANGEFREQLIST = array( 'product' => 'Always', 'device' => 'Hourly', 'intelligent' => 'Daily', 'course' => 'Weekly', 'login' => 'Monthly', 'about' => 'Yearly' );?>