gzip解码函数gzdecode()

最近采集土豆网时,发现它的页面似乎启用了 gzip 压缩:用 file_get_contents() 获取到的内容是一堆乱码。起初我反复尝试转换页面编码,但都不行,后来想起以前碰到过类似的情况——果然是 gzip 的问题。

分析:出现乱码大概有两种情况。一是自身页面的编码与获取到的页面编码不一致;二是目标页面启用了 gzip 压缩,拿到的是尚未解压的数据。对于第二种情况,可以用下面的函数解压:

<?php

// PHP >= 5.4 built with zlib already ships a native gzdecode(); declaring it
// again would be a fatal "Cannot redeclare" error, so this implementation is
// only installed as a fallback. Because the declaration is conditional, it is
// available only after this statement has executed.
if (! function_exists('gzdecode')) {
    /**
     * Decode a complete gzip (RFC 1952) member held in a string.
     *
     * Header layout handled here:
     *   bytes 0-1  magic 0x1f 0x8b
     *   byte  2    compression method (8 = deflate)
     *   byte  3    flags: 1 FTEXT, 2 FHCRC, 4 FEXTRA, 8 FNAME, 16 FCOMMENT
     *   bytes 4-7  mtime, byte 8 XFL, byte 9 OS (not needed for decoding)
     * followed by the optional FEXTRA, FNAME, FCOMMENT and FHCRC fields in
     * that order, the deflate stream, and an 8-byte trailer (CRC32, ISIZE).
     *
     * @param string $data Raw gzip data.
     * @return string|null|false The decompressed payload on success; null when
     *         the input is not gzip data, has reserved flag bits set, or has
     *         no compressed body; false for any other malformed or corrupt
     *         input (truncated header fields, bad header CRC, unsupported
     *         compression method, inflate failure, CRC32/size mismatch).
     */
    function gzdecode($data) {
        $len = strlen($data);
        if ($len < 18 || substr($data, 0, 2) !== "\x1f\x8b") {
            return null; // Too short or wrong magic: not gzip
        }

        $method = ord($data[2]);
        $flags = ord($data[3]);

        // Bits 5-7 are reserved and must be zero. The parentheses matter:
        // `!=` binds tighter than `&` in PHP.
        if (($flags & 31) !== $flags) {
            return null;
        }

        // Fixed part of the header; optional fields start right after it.
        $headerlen = 10;

        if ($flags & 4) {
            // FEXTRA: 2-byte little-endian length followed by that many bytes.
            if ($len - $headerlen - 2 < 8) {
                return false;
            }
            $extralen = unpack('v', substr($data, $headerlen, 2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false;
            }
            $headerlen += 2 + $extralen;
        }

        if ($flags & 8) {
            // FNAME: zero-terminated original file name.
            if ($len - $headerlen - 1 < 8) {
                return false;
            }
            $nul = strpos($data, "\0", $headerlen);
            if ($nul === false || $len - $nul - 1 < 8) {
                return false;
            }
            $headerlen = $nul + 1;
        }

        if ($flags & 16) {
            // FCOMMENT: zero-terminated comment.
            if ($len - $headerlen - 1 < 8) {
                return false;
            }
            $nul = strpos($data, "\0", $headerlen);
            if ($nul === false || $len - $nul - 1 < 8) {
                return false;
            }
            $headerlen = $nul + 1;
        }

        if ($flags & 2) {
            // FHCRC: low 16 bits of the CRC32 over all preceding header bytes.
            if ($len - $headerlen - 2 < 8) {
                return false;
            }
            $calccrc = crc32(substr($data, 0, $headerlen)) & 0xffff;
            $headercrc = unpack('v', substr($data, $headerlen, 2));
            if ($headercrc[1] !== $calccrc) {
                return false; // Bad header CRC
            }
            $headerlen += 2;
        }

        // Trailer: CRC32 of the uncompressed data, then its length mod 2^32.
        // On 32-bit PHP both values may come back negative.
        $datacrc = unpack('V', substr($data, -8, 4));
        $datacrc = $datacrc[1];
        $isize = unpack('V', substr($data, -4));
        $isize = $isize[1];

        $bodylen = $len - $headerlen - 8;
        if ($bodylen < 1) {
            return null; // No compressed body between header and trailer
        }

        if ($method !== 8) {
            return false; // Deflate is the only method defined by RFC 1952
        }

        $inflated = gzinflate(substr($data, $headerlen, $bodylen));
        if ($inflated === false) {
            return false; // Corrupt deflate stream
        }

        // Verify the payload against the trailer.
        if ($isize !== strlen($inflated) || crc32($inflated) !== $datacrc) {
            return false;
        }
        return $inflated;
    }
}

?>

Leave a Reply

(will not be published)