最近采集土豆,发现土豆的页面貌似用了gzip,当使用file_get_contents()获取得到的是一些乱码,当时试着将页面的编码改来改去,但还是不行,想到了以前碰到类似的情况,好吧,果然。
分析:
大概有2种情况,1是自身的页面编码,和获取的页面编码不一致。
另外的一种情况就是,目标页面用了gzip技术。
用以下的函数可解决这个问题:
<?php function gzdecode($data) { $len = strlen ( $data ); if ($len < 18 || strcmp ( substr ( $data, 0, 2 ), "\x1f\x8b" )) { return null; // Not GZIP format (See RFC 1952) } $method = ord ( substr ( $data, 2, 1 ) ); // Compression method $flags = ord ( substr ( $data, 3, 1 ) ); // Flags if ($flags & 31 != $flags) { // Reserved bits are set -- NOT ALLOWED by RFC 1952 return null; } // NOTE: $mtime may be negative (PHP integer limitations) $mtime = unpack ( "V", substr ( $data, 4, 4 ) ); $mtime = $mtime [1]; $xfl = substr ( $data, 8, 1 ); $os = substr ( $data, 8, 1 ); $headerlen = 10; $extralen = 0; $extra = ""; if ($flags & 4) { // 2-byte length prefixed EXTRA data in header if ($len - $headerlen - 2 < 8) { return false; // Invalid format } $extralen = unpack ( "v", substr ( $data, 8, 2 ) ); $extralen = $extralen [1]; if ($len - $headerlen - 2 - $extralen < 8) { return false; // Invalid format } $extra = substr ( $data, 10, $extralen ); $headerlen += 2 + $extralen; } $filenamelen = 0; $filename = ""; if ($flags & 8) { // C-style string file NAME data in header if ($len - $headerlen - 1 < 8) { return false; // Invalid format } $filenamelen = strpos ( substr ( $data, 8 + $extralen ), chr ( 0 ) ); if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { return false; // Invalid format } $filename = substr ( $data, $headerlen, $filenamelen ); $headerlen += $filenamelen + 1; } $commentlen = 0; $comment = ""; if ($flags & 16) { // C-style string COMMENT data in header if ($len - $headerlen - 1 < 8) { return false; // Invalid format } $commentlen = strpos ( substr ( $data, 8 + $extralen + $filenamelen ), chr ( 0 ) ); if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { return false; // Invalid header format } $comment = substr ( $data, $headerlen, $commentlen ); $headerlen += $commentlen + 1; } $headercrc = ""; if ($flags & 1) { // 2-bytes (lowest order) of CRC32 on header present if ($len - $headerlen - 2 < 8) { return false; // Invalid format } $calccrc = crc32 ( substr ( $data, 0, $headerlen ) ) & 0xffff; $headercrc = unpack ( "v", substr ( $data, $headerlen, 2 ) ); $headercrc = $headercrc [1]; if ($headercrc != $calccrc) { return false; // Bad header CRC } $headerlen += 2; } // GZIP FOOTER - These be negative due to PHP's limitations $datacrc = unpack ( "V", substr ( $data, - 8, 4 ) ); $datacrc = $datacrc [1]; $isize = unpack ( "V", substr ( $data, - 4 ) ); $isize = $isize [1]; // Perform the decompression: $bodylen = $len - $headerlen - 8; if ($bodylen < 1) { // This should never happen - IMPLEMENTATION BUG! return null; } $body = substr ( $data, $headerlen, $bodylen ); $data = ""; if ($bodylen > 0) { switch ($method) { case 8 : // Currently the only supported compression method: $data = gzinflate ( $body ); break; default : // Unknown compression method return false; } } else { // I'm not sure if zero-byte body content is allowed. // Allow it for now... Do nothing... } // Verifiy decompressed size and CRC32: // NOTE: This may fail with large data sizes depending on how // PHP's integer limitations affect strlen() since $isize // may be negative for large sizes. if ($isize != strlen ( $data ) || crc32 ( $data ) != $datacrc) { // Bad format! Length or CRC doesn't match! return false; } return $data; } ?>