TidyXml.class.php 10 KB
<?php

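/**
 *
 * XML (including HTML) processing class.
 *
 * PHP extensions this class relies on:
 *
 * 1. tidy extension. For installation steps see http://hibox.yoho.cn:8080/drupal/node/54. For installation on Windows, please search online.
 * 2. xsl extension. For installation steps see http://hibox.yoho.cn:8080/drupal/node/54. For installation on Windows, please search online.
 *
 * Required PHP version: 5.0+
 *
 * @author dan
 */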
class Util_Common_Xml_TidyXml
{
    /** @var tidy Tidy instance created in the constructor. */
    private $tidy = null;

    /** @var DOMDocument Document filled by loadXml() / loadHtml(). */
    private $xmlDom = null;

    /** @var XSLTProcessor Processor used by transformByXsl(); replaceable via setXslProcessor(). */
    private $xslProcessor = null;

    /**
     * Creates the tidy, DOM and XSLT objects this helper works with.
     */
    public function __construct()
    {
        $this->tidy = new tidy();
        $this->xmlDom = new DOMDocument;
        $this->xslProcessor = new XSLTProcessor;
    }

    /**
     * Loads an XML string into the internal DOM document.
     *
     * After loading, call getXmlDom() to obtain the DOM object for DOM operations.
     *
     * @param String $xml XML source
     * @return boolean result of DOMDocument::loadXML()
     */
    public function loadXml($xml)
    {
        return $this->xmlDom->loadXML($xml);
    }

    /**
     * Loads an HTML string into the internal DOM document.
     *
     * After loading, call getXmlDom() to obtain the DOM object for DOM operations.
     *
     * @param String $html HTML source
     * @return boolean result of DOMDocument::loadHTML()
     */
    public function loadHtml($html)
    {
        return $this->xmlDom->loadHTML($html);
    }

    /**
     * Returns the internal DOM document.
     *
     * @return DOMDocument
     */
    public function getXmlDom()
    {
        return $this->xmlDom;
    }

    /**
     * Returns the tidy object that was created in the constructor.
     *
     * @return tidy
     */
    public function getTidy()
    {
        return $this->tidy;
    }

    /**
     * Converts an HTML string to XHTML and returns it as a string.
     *
     * The conversion runs in two steps:
     * 1. repair (repairString())
     * 2. convert (tidy parse with 'output-xhtml' forced on)
     *
     * @param String $htmlString HTML source to convert
     * @param array $parseConfig tidy options for the conversion step; when empty,
     *        indent / output-xhtml / wrap=200 are used.
     *        @see http://tidy.sourceforge.net/docs/quickref.html
     * @param String $encode
     *        The encoding parameter sets the encoding for input/output documents.
     *        The possible values for encoding are: ascii, latin0, latin1, raw, utf8, iso2022, mac,
     *                                               win1252, ibm858, utf16, utf16le, utf16be, big5, and shiftjis.
     * @param array $repairConfig tidy options for the repair step. @see http://tidy.sourceforge.net/docs/quickref.html
     * @return String the XHTML, or '' when the repair step produced nothing or tidy could not parse it
     */
    public function htmlStringToXhtml($htmlString, $parseConfig = array(), $encode = 'utf8', $repairConfig = array())
    {
        $repairedHtml = $this->repairString($htmlString, $repairConfig, $encode);

        return empty($repairedHtml) ? '' : $this->_htmlToXhtml($repairedHtml, $parseConfig, $encode);
    }

    /**
     * Converts an HTML file to XHTML and returns it as a string.
     *
     * The conversion runs in two steps:
     * 1. repair (repairFile())
     * 2. convert (tidy parse with 'output-xhtml' forced on)
     *
     * @param String $filePath path of the HTML file to convert
     * @param array $parseConfig tidy options for the conversion step; when empty,
     *        indent / output-xhtml / wrap=200 are used.
     *        @see http://tidy.sourceforge.net/docs/quickref.html
     * @param String $encode
     *        The encoding parameter sets the encoding for input/output documents.
     *        The possible values for encoding are: ascii, latin0, latin1, raw, utf8, iso2022, mac,
     *                                               win1252, ibm858, utf16, utf16le, utf16be, big5, and shiftjis.
     * @param array $repairConfig tidy options for the repair step. @see http://tidy.sourceforge.net/docs/quickref.html
     * @return String the XHTML, or '' when the repair step produced nothing or tidy could not parse it
     */
    public function htmlFileToXhtml($filePath, $parseConfig = array(), $encode = 'utf8', $repairConfig = array())
    {
        $repairedHtml = $this->repairFile($filePath, $repairConfig, $encode);

        return empty($repairedHtml) ? '' : $this->_htmlToXhtml($repairedHtml, $parseConfig, $encode);
    }

    /**
     * Cleans an XML string with tidy and returns the result as a string.
     *
     * The conversion runs in two steps:
     * 1. repair (repairString())
     * 2. convert (tidy parse; input-xml / output-xml are used when $parseConfig is empty)
     *
     * @param String $xmlString XML source to convert
     * @param array $parseConfig tidy options for the conversion step. @see http://tidy.sourceforge.net/docs/quickref.html
     * @param String $encode
     *        The encoding parameter sets the encoding for input/output documents.
     *        The possible values for encoding are: ascii, latin0, latin1, raw, utf8, iso2022, mac,
     *                                               win1252, ibm858, utf16, utf16le, utf16be, big5, and shiftjis.
     * @param array $repairConfig tidy options for the repair step. @see http://tidy.sourceforge.net/docs/quickref.html
     * @return String the XML, or '' when the repair step produced nothing or tidy could not parse it
     */
    public function xmlStringToXml($xmlString, $parseConfig = array(), $encode = 'utf8', $repairConfig = array())
    {
        $repairedXml = $this->repairString($xmlString, $repairConfig, $encode);

        return empty($repairedXml) ? '' : $this->_xmlToXml($repairedXml, $parseConfig, $encode);
    }

    /**
     * Cleans an XML file with tidy and returns the result as a string.
     *
     * The conversion runs in two steps:
     * 1. repair (repairFile())
     * 2. convert (tidy parse; input-xml / output-xml are used when $parseConfig is empty)
     *
     * @param String $filePath path of the XML file to convert
     * @param array $parseConfig tidy options for the conversion step. @see http://tidy.sourceforge.net/docs/quickref.html
     * @param String $encode
     *        The encoding parameter sets the encoding for input/output documents.
     *        The possible values for encoding are: ascii, latin0, latin1, raw, utf8, iso2022, mac,
     *                                               win1252, ibm858, utf16, utf16le, utf16be, big5, and shiftjis.
     * @param array $repairConfig tidy options for the repair step. @see http://tidy.sourceforge.net/docs/quickref.html
     * @return String the XML, or '' when the repair step produced nothing or tidy could not parse it
     */
    public function xmlFileToXml($filePath, $parseConfig = array(), $encode = 'utf8', $repairConfig = array())
    {
        $repairedXml = $this->repairFile($filePath, $repairConfig, $encode);

        return empty($repairedXml) ? '' : $this->_xmlToXml($repairedXml, $parseConfig, $encode);
    }

    /**
     * Repairs a file with tidy (mismatched tags, incomplete tags, and so on).
     *
     * Thin wrapper around tidy::repairFile(); its return value is passed through unchanged.
     *
     * @param String $filePath path of the file to repair
     * @param array $config tidy options
     * @param String $encode tidy encoding name
     * @return mixed whatever tidy::repairFile() returns
     */
    public function repairFile($filePath, $config = array(), $encode = 'utf8')
    {
        return $this->tidy->repairFile($filePath, $config, $encode);
    }


    /**
     * Repairs a string with tidy (mismatched tags, incomplete tags, and so on).
     *
     * Thin wrapper around tidy::repairString(); its return value is passed through unchanged.
     *
     * @param String $string HTML (or XML) source
     * @param array $config tidy options
     * @param String $encode tidy encoding name
     * @return mixed whatever tidy::repairString() returns
     */
    public function repairString($string, $config = array(), $encode = 'utf8')
    {
        return $this->tidy->repairString($string, $config, $encode);
    }

    /**
     * Returns the XSLT processor object.
     *
     * @return XSLTProcessor
     */
    public function getXslProcessor()
    {
        return $this->xslProcessor;
    }

    /**
     * Replaces the XSLT processor object.
     *
     * @param XSLTProcessor $xslProcessor
     */
    public function setXslProcessor(XSLTProcessor $xslProcessor)
    {
        $this->xslProcessor = $xslProcessor;
    }

    /**
     * Transforms an XML/HTML string into another format using an XSL stylesheet.
     *
     * The stylesheet is read from $xslPath on every call.
     *
     * @param String $xmlString source document
     * @param String $xslPath path of the XSL file
     * @param boolean $isHtml whether the source is HTML (parsed with loadHTML); defaults to TRUE
     * @param boolean $needClean whether to run the source through tidy before transforming
     * @return mixed the original $xmlString when there is nothing to transform
     *         (empty / non-string source, or $xslPath is not a file);
     *         FALSE when the stylesheet cannot be loaded or imported;
     *         otherwise the result of XSLTProcessor::transformToXML()
     */
    public function transformByXsl($xmlString, $xslPath, $isHtml = TRUE, $needClean = FALSE)
    {
        // Kept as the fallback return value before any cleaning happens.
        $transformedResult = $xmlString;

        if ($needClean)
        {
            // Clean the source first to avoid transformation errors. XML sources
            // must be repaired in tidy's XML mode; with the default (HTML) mode
            // tidy would rewrite the document as HTML before it reaches loadXML().
            $cleanConfig = $isHtml
                ? array()
                : array('input-xml' => TRUE, 'output-xml' => TRUE);

            $xmlString = $this->repairString($xmlString, $cleanConfig);
        }

        if (!is_string($xmlString) || empty($xmlString) || !is_file($xslPath))
        {
            return $transformedResult;
        }

        $xsl = new DOMDocument;

        // Stop here when the stylesheet is unusable. The processor is shared
        // across calls, so continuing could transform with whatever stylesheet
        // it still holds from an earlier call.
        if (!$xsl->load($xslPath) || !$this->xslProcessor->importStyleSheet($xsl))
        {
            return FALSE;
        }

        $domDoc = new DOMDocument;

        if ($isHtml)
        {
            $domDoc->loadHTML($xmlString);
        }
        else
        {
            $domDoc->loadXML($xmlString);
        }

        return $this->xslProcessor->transformToXML($domDoc);
    }

    /**
     * Converts (already repaired) HTML to XHTML.
     *
     * @param String $htmlString HTML source
     * @param array $config tidy options; when empty, indent / output-xhtml / wrap=200 are used
     * @param String $encode tidy encoding name; falls back to 'utf8' when not a string
     * @return String the XHTML, or '' when tidy could not parse the input
     */
    private function _htmlToXhtml($htmlString, $config = array(), $encode = 'utf8')
    {
        if (empty($config))
        {
            $config = array('indent' => TRUE,
                            'output-xhtml' => TRUE,
                            'wrap' => 200);
        }

        // Force XHTML output. Only possible for option arrays: tidy also accepts
        // a config file path (string), which cannot take an array offset.
        if (is_array($config))
        {
            $config['output-xhtml'] = TRUE;
        }

        return $this->_tidyConvert($htmlString, $config, $encode);
    }

    /**
     * Runs (already repaired) XML through tidy and returns the cleaned XML.
     *
     * @param String $xmlString XML source
     * @param array $config tidy options; when empty, input-xml / output-xml are used
     * @param String $encode tidy encoding name; falls back to 'utf8' when not a string
     * @return String the XML, or '' when tidy could not parse the input
     */
    private function _xmlToXml($xmlString, $config = array(), $encode = 'utf8')
    {
        if (empty($config))
        {
            $config = array('input-xml' => TRUE,
                            'output-xml' => TRUE);
        }

        return $this->_tidyConvert($xmlString, $config, $encode);
    }

    /**
     * Parses a string with tidy, runs cleanRepair() and returns the output.
     *
     * @param String $source document source
     * @param mixed $config tidy options (array) or config file path (string)
     * @param String $encode tidy encoding name; falls back to 'utf8' when not a string
     * @return String tidy output, or '' when tidy_parse_string() fails
     */
    private function _tidyConvert($source, $config, $encode)
    {
        if (!is_string($encode))
        {
            $encode = 'utf8';
        }

        $tidyResult = tidy_parse_string($source, $config, $encode);

        // tidy_parse_string() returns FALSE on failure; calling a method on it
        // would be a fatal error.
        if (!($tidyResult instanceof tidy))
        {
            return '';
        }

        $tidyResult->cleanRepair();

        return tidy_get_output($tidyResult);
    }
}

?>