php提取网页正文内容的例子

當(dāng)前位置：首頁(yè) > 范文|應(yīng)用文 > IT技術(shù)專欄 > 網(wǎng)絡(luò)編程

來(lái)源：易賢網(wǎng) 閱讀：719 次日期：2014-12-03 09:44:48

溫馨提示：易賢網(wǎng)小編為您整理了“php提取網(wǎng)頁(yè)正文內(nèi)容的例子”,方便廣大網(wǎng)友查閱！

难点在于如何识别并保留网页中的文章部分，同时删除其它无用的信息，并且要做到通用化，不能像火车头那样根据目标站来制定采集规则，因为搜索引擎结果中有各种各样的网页。

抓回一个页面的数据后，如何匹配出正文部分？郑晓在下班路上想到的思路是：

1. 提取出body标签部分 –> 剔除所有链接 –> 剔除所有script、注释 –> 剔除所有空白标签(包括标签内不含中文的) –> 获取结果。

2. 直接匹配出非链接的、位于div、p、h标签中的中文部分？

这样还是会有不少其它多余信息，比如底部信息等。该如何处理？不知道大家有没有什么思路或建议？

这个类是从网上找到的一个用php实现的提取网页正文部分的算法，郑晓在本地也测试了一下，准确率非常高。

代码如下:

<?php

/**
 * Extracts the main article content from an HTML page.
 *
 * The page is parsed with DOMDocument, every <p> element contributes a score
 * to its parent element, and the parent with the highest score is returned as
 * the article body after junk tags and attributes have been stripped.
 *
 * Usage: $result = (new Readability($html, 'utf-8'))->getContent();
 */
class Readability {

    // Name of the attribute used to store an element's content score.
    const ATTR_CONTENT_SCORE = "contentScore";

    // The DOM parser is driven in UTF-8 only.
    const DOM_DEFAULT_CHARSET = "utf-8";

    // Message carried by the exception thrown when no content box is found.
    const MESSAGE_CAN_NOT_GET = "Readability was unable to parse this page for content.";

    // Parsed document (DOMDocument), or null before construction.
    protected $DOM = null;

    // The raw source string passed to the constructor.
    protected $source = "";

    // Parent elements of the paragraphs scored by getTopBox().
    private $parentNodes = array();

    // Tags removed from the extracted content.
    private $junkTags = array(
        "style", "form", "iframe", "script", "button", "input", "textarea",
        "noscript", "select", "option", "object", "applet", "basefont",
        "bgsound", "blink", "canvas", "command", "menu", "nav", "datalist",
        "embed", "frame", "frameset", "keygen", "label", "marquee", "link",
    );

    // Attributes removed from every element of the extracted content.
    private $junkAttrs = array("style", "class", "onclick", "onmouseover", "align", "border", "margin");

    /**
     * Parses $source into a DOM tree.
     *
     * @param string $source     HTML source of the page
     * @param string $input_char Character encoding of $source; defaults to utf-8
     */
    public function __construct($source, $input_char = "utf-8") {
        $this->source = $source;

        // Turn every non-ASCII character into an HTML entity so the parser only
        // ever sees ASCII input.
        // NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated as of
        // PHP 8.2; kept here so the output stays identical to earlier versions.
        $source = mb_convert_encoding($source, 'HTML-ENTITIES', $input_char);

        // Normalise the markup before handing it to the parser.
        $source = $this->preparSource($source);

        $this->DOM = new DOMDocument('1.0', $input_char);

        // Collect libxml warnings internally instead of emitting them; real-world
        // HTML is rarely valid and the warnings are not actionable here.
        $previousErrorMode = libxml_use_internal_errors(true);

        try {
            // The leading <?xml encoding="..."> is a hack that tells loadHTML()
            // which charset to assume; it is removed again below.
            $loaded = $this->DOM->loadHTML(
                '<?xml encoding="' . self::DOM_DEFAULT_CHARSET . '">' . $source
            );
            if (!$loaded) {
                throw new Exception("Parse HTML Error!");
            }

            // Collect first, remove afterwards: childNodes is a live list and
            // removing while iterating can skip nodes.
            $hackNodes = array();
            foreach ($this->DOM->childNodes as $item) {
                if ($item->nodeType == XML_PI_NODE) {
                    $hackNodes[] = $item;
                }
            }
            foreach ($hackNodes as $item) {
                $this->DOM->removeChild($item);
            }

            // Set the proper encoding on the document.
            $this->DOM->encoding = self::DOM_DEFAULT_CHARSET;
        } catch (Exception $e) {
            // Deliberately best-effort: a failed parse leaves an empty document,
            // for which getContent() raises MESSAGE_CAN_NOT_GET.
        }

        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    /**
     * Runs preg_replace() and keeps the input unchanged if PCRE reports an
     * error (preg_replace() returns null in that case).
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $subject
     * @param int    $limit
     * @return string
     */
    private function replaceOrKeep($pattern, $replacement, $subject, $limit = -1) {
        $result = preg_replace($pattern, $replacement, $subject, $limit);

        return $result === null ? $subject : $result;
    }

    /**
     * Pre-processes the HTML so the DOM parser can handle it reliably.
     *
     * @param string $string HTML source
     * @return string
     */
    private function preparSource($string) {
        // Drop the first charset declaration; the charset is forced through the
        // <?xml encoding> hack in the constructor instead.
        $string = $this->replaceOrKeep('/charset=([\w|\-]+);?/', "", $string, 1);

        // Replace doubled-up <br> tags (including the <br /> form) with a
        // paragraph break, and remove <font> tags.
        $string = $this->replaceOrKeep('/<br\s*\/?>\s*<br\s*\/?>/i', "</p><p>", $string);
        $string = $this->replaceOrKeep('/<\/?font[^>]*>/i', "", $string);

        // Remove scripts up front so their bodies cannot confuse the parser.
        $string = $this->replaceOrKeep('#<script(.*?)>(.*?)</script>#is', "", $string);

        return trim($string);
    }

    /**
     * Removes every $TagName element below $RootNode.
     *
     * @param DOMDocument $RootNode
     * @param string      $TagName
     * @return DOMDocument the same $RootNode, for chaining
     */
    private function removeJunkTag($RootNode, $TagName) {
        $Tags = $RootNode->getElementsByTagName($TagName);

        // Always index 0: the list is live, so removing an element also removes
        // it from the result set.
        while ($Tag = $Tags->item(0)) {
            $Tag->parentNode->removeChild($Tag);
        }

        return $RootNode;
    }

    /**
     * Removes the attribute $Attr from every element below $RootNode.
     *
     * @param DOMDocument $RootNode
     * @param string      $Attr
     * @return DOMDocument the same $RootNode, for chaining
     */
    private function removeJunkAttr($RootNode, $Attr) {
        $Tags = $RootNode->getElementsByTagName("*");

        $i = 0;
        while ($Tag = $Tags->item($i++)) {
            $Tag->removeAttribute($Attr);
        }

        return $RootNode;
    }

    /**
     * Score adjustment for a class name or id value.
     *
     * NOTE(review): the positive pattern was missing from the pasted source and
     * is restored here from the upstream Readability heuristic - confirm.
     *
     * @param string $value class or id attribute value
     * @return int -50 for comment/footer-like names, +25 for article-like names, else 0
     */
    private function scoreIdentifier($value) {
        if (preg_match("/(comment|meta|footer|footnote)/i", $value)) {
            return -50;
        }

        $articlePattern = '/((^|\s)(post|hentry|entry[-]?(content|text|body)?'
            . '|article[-]?(content|text|body)?)(\s|$))/i';
        if (preg_match($articlePattern, $value)) {
            return 25;
        }

        return 0;
    }

    /**
     * Finds the element that most likely holds the main content of the page.
     *
     * Every <p> adds to the score of its parent element: the class and id
     * adjustments from scoreIdentifier(), plus the byte length of the
     * paragraph text when it is longer than 10 bytes.
     *
     * @return DOMElement|null the best-scoring parent, or null if none scored above 0
     */
    private function getTopBox() {
        // Start from a clean slate so repeated calls give the same result.
        foreach ($this->parentNodes as $scored) {
            $scored->removeAttribute(self::ATTR_CONTENT_SCORE);
        }
        $this->parentNodes = array();

        $allParagraphs = $this->DOM->getElementsByTagName("p");

        $i = 0;
        while ($paragraph = $allParagraphs->item($i++)) {
            $parentNode = $paragraph->parentNode;

            // Only elements carry attributes; skip anything else.
            if (!($parentNode instanceof DOMElement)) {
                continue;
            }

            $firstVisit = !$parentNode->hasAttribute(self::ATTR_CONTENT_SCORE);

            $contentScore = intval($parentNode->getAttribute(self::ATTR_CONTENT_SCORE));
            $contentScore += $this->scoreIdentifier($parentNode->getAttribute("class"));
            $contentScore += $this->scoreIdentifier($parentNode->getAttribute("id"));

            // Longer paragraphs weigh more; very short ones add nothing.
            $textLength = strlen($paragraph->nodeValue);
            if ($textLength > 10) {
                $contentScore += $textLength;
            }

            // Store the running score on the parent element.
            $parentNode->setAttribute(self::ATTR_CONTENT_SCORE, $contentScore);

            // Remember each parent once for the selection pass below.
            if ($firstVisit) {
                $this->parentNodes[] = $parentNode;
            }
        }

        $topBox = null;
        $topScore = 0;

        foreach ($this->parentNodes as $parentNode) {
            $contentScore = intval($parentNode->getAttribute(self::ATTR_CONTENT_SCORE));

            if ($contentScore && $contentScore > $topScore) {
                $topBox = $parentNode;
                $topScore = $contentScore;
            }
        }

        return $topBox;
    }

    /**
     * Returns the page title: the text of the first <title> element, up to the
     * last " - " separator.
     *
     * @return string|null null when the document has no <title>
     */
    public function getTitle() {
        $split_point = ' - ';
        $titleNodes = $this->DOM->getElementsByTagName("title");

        if ($titleNodes->length && $titleNode = $titleNodes->item(0)) {
            $title = trim($titleNode->nodeValue);

            // Split on the reversed string and reverse each piece back, so the
            // delimiter is matched from the right-hand end of the title.
            $result = array_map('strrev', explode($split_point, strrev($title)));

            return sizeof($result) > 1 ? array_pop($result) : $title;
        }

        return null;
    }

    /**
     * Returns the src of the first <img> below $node.
     *
     * @param DOMDocument|DOMElement $node
     * @return string|null null when there is no image
     */
    public function getLeadImageUrl($node) {
        $images = $node->getElementsByTagName("img");

        if ($images->length && $leadImage = $images->item(0)) {
            return $leadImage->getAttribute("src");
        }

        return null;
    }

    /**
     * Extracts the main content of the page.
     *
     * @return array|false array with keys lead_image_url, word_count, title and
     *                     content; false when no document is available
     * @throws RuntimeException when no suitable content element is found
     */
    public function getContent() {
        if (!$this->DOM) return false;

        $ContentTitle = $this->getTitle();
        $ContentBox = $this->getTopBox();

        if ($ContentBox === null) {
            throw new RuntimeException(self::MESSAGE_CAN_NOT_GET);
        }

        // Copy the content element into a fresh document before cleaning it, so
        // the parsed page itself is left untouched.
        $Target = new DOMDocument;
        $Target->appendChild($Target->importNode($ContentBox, true));

        foreach ($this->junkTags as $tag) {
            $Target = $this->removeJunkTag($Target, $tag);
        }

        foreach ($this->junkAttrs as $attr) {
            $Target = $this->removeJunkAttr($Target, $attr);
        }

        // saveHTML() emits entities for non-ASCII characters; decode them back.
        // NOTE(review): 'HTML-ENTITIES' is deprecated as of PHP 8.2; kept for
        // output compatibility.
        $content = mb_convert_encoding($Target->saveHTML(), self::DOM_DEFAULT_CHARSET, "HTML-ENTITIES");

        return array(
            'lead_image_url' => $this->getLeadImageUrl($Target),
            'word_count' => mb_strlen(strip_tags($content), self::DOM_DEFAULT_CHARSET),
            'title' => $ContentTitle ? $ContentTitle : null,
            'content' => $content,
        );
    }

    public function __destruct() { }

}

使用起来也非常简单：实例化时传入网页的html源码和相应的编码，然后直接调用其getContent方法，即可返回提取到的正文部分。提取出的文章中可能还会含有少部分链接，可以自己后期再处理。

更多信息請(qǐng)查看IT技術(shù)專欄

更多信息請(qǐng)查看網(wǎng)絡(luò)編程

上一篇：一些常用的ADO記錄集進(jìn)行管理

下一篇：為ckeditor編輯器修改添加一鍵排版功能

易賢網(wǎng)手機(jī)網(wǎng)站地址：php提取網(wǎng)頁(yè)正文內(nèi)容的例子

由于各方面情況的不斷調(diào)整與變化，易賢網(wǎng)提供的所有考試信息和咨詢回復(fù)僅供參考，敬請(qǐng)考生以權(quán)威部門公布的正式信息和咨詢?yōu)闇?zhǔn)！

相關(guān)閱讀網(wǎng)絡(luò)編程

Shell中如何刪除文本比較長(zhǎng)的行的實(shí)現(xiàn)方法10月30日

vue.js語(yǔ)法及常用指令10月30日

python 讀寫中文json的實(shí)例詳解10月30日

Objective-C Json 實(shí)例詳解10月30日

bootstrap table sum總數(shù)量統(tǒng)計(jì)實(shí)現(xiàn)方法10月30日

python生成二維碼的實(shí)例詳解10月30日

Python批量更改文件名的實(shí)現(xiàn)方法10月30日

解決出現(xiàn)Incorrect integer value的問(wèn)題10月30日

jQuery實(shí)現(xiàn)切換隱藏與顯示同時(shí)切換圖標(biāo)功能10月30日

docker python api 安裝配置的詳解10月30日

javascript按鈕禁用和啟用的效果實(shí)例代碼10月30日

vue.js todolist實(shí)現(xiàn)代碼10月30日

vue.js 父向子組件傳參的實(shí)例代碼10月30日

apache 開啟重定向 rewrite的實(shí)現(xiàn)方法10月30日

Vue.js劃分組件的方法10月30日

python logging日志模塊的詳解10月30日

vue中的scope使用詳解10月30日

docker cgroup 資源監(jiān)控的詳解10月30日

使用Android Studio 開發(fā)自己的SDK教程10月23日

linux系統(tǒng)下MongoDB單節(jié)點(diǎn)安裝教程10月23日

易賢網(wǎng)移動(dòng)網(wǎng)站

2026上岸·考公考編培訓(xùn)報(bào)班

報(bào)班類型
姓名
手機(jī)號(hào)
驗(yàn)證碼

茄子在线看片免费人成视频,午夜福利精品a在线观看,国产高清自产拍在线观看,久久综合久久狠狠综合