将html编码转为utf8
gopm get -g -v golang.org/x/text # 虽然也是官方库,但是没有包括在标准库里
将gbk转为utf8
注意:如果原html本身就是utf-8编码,那么按gbk转化后反而会乱码
package main
import (
"net/http"
"io/ioutil"
"fmt"
"golang.org/x/text/transform"
"golang.org/x/text/encoding/simplifiedchinese"
)
func main(){
resp, err :=http.Get("https://www.zhenai.com/zhenghun")
if err!=nil{
panic(err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
fmt.Printf("Error Status Code: %d", resp.StatusCode)
return
}
utf8Reader :=transform.NewReader(resp.Body,simplifiedchinese.GBK.NewDecoder())
all, err :=ioutil.ReadAll(utf8Reader)
if err!=nil {
panic(err)
}
fmt.Printf("%s\n",all)
}
改进
可以看到上面的代码通用性不强:你怎么能肯定网页编码一定是gbk?
还有一个库,可以自动帮助检测网页的编码
gopm get -g -v golang.org/x/net/html
package main
import (
"net/http"
"io/ioutil"
"fmt"
"golang.org/x/text/transform"
"io"
"golang.org/x/text/encoding"
"bufio"
"golang.org/x/net/html/charset"
)
// main downloads the page, detects the encoding of the response body,
// decodes the body with the detected encoding and prints the result to
// standard output.
func main() {
	resp, err := http.Get("https://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("Error Status Code: %d", resp.StatusCode)
		return
	}

	// Use ONE buffered reader for both detection and decoding. Peeking
	// through a throwaway bufio.Reader would pull the first chunk of the body
	// out of resp.Body into a buffer that is then discarded, so a later read
	// from resp.Body would start mid-document. Because bufio.NewReader returns
	// an existing *bufio.Reader unchanged, determinEncoding peeks on this very
	// reader and the peeked bytes are still available to the decoder below.
	bodyReader := bufio.NewReader(resp.Body)
	e := determinEncoding(bodyReader)

	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", all)
}
// determinEncoding inspects up to the first 1024 bytes available from r and
// returns the encoding that charset.DetermineEncoding picks for them. No
// Content-Type hint is supplied, so the choice is based on the bytes alone.
//
// Callers that want to keep reading the stream afterwards should pass a
// *bufio.Reader and continue reading from that same reader: bufio.NewReader
// hands back an existing *bufio.Reader of sufficient size unchanged, so the
// peeked bytes are not consumed. For any other io.Reader, a temporary buffer
// is created and whatever it read from r is no longer available from r.
//
// It panics if reading from r fails for any reason other than the stream
// being shorter than 1024 bytes.
func determinEncoding(r io.Reader) encoding.Encoding {
	// Peek returns the bytes it could buffer together with io.EOF when the
	// stream ends before 1024 bytes; a short page is still valid input for
	// detection, so only other errors are treated as fatal.
	sample, err := bufio.NewReader(r).Peek(1024)
	if err != nil && err != io.EOF {
		panic(err)
	}

	// The returned encoding name and the "certain" flag are not needed here.
	e, _, _ := charset.DetermineEncoding(sample, "")
	return e
}