以前使用 PhantomJS,但效果并不好;现在使用 Golang + selenium 驱动 chrome 的 headless 模式,再用 goquery 解析 HTML 字符串,很方便。
首先在服务器上安装最新版本的 chrome 和 chromedriver
- chrome https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
- chromedriver https://sites.google.com/a/chromium.org/chromedriver/downloads
安装 chrome:
# Refresh the package index before installing anything.
apt-get update
# Libraries installed ahead of the Chrome package.
apt-get install libxss1 libappindicator1 libindicator7
# wget is needed for the download on the next line; curl is kept from the original package list.
apt-get install curl wget
# Fetch the current stable Chrome .deb and install it.
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
dpkg -i google-chrome*.deb
# Let apt resolve any dependencies that dpkg left unsatisfied.
apt-get install -f
安装 chromedriver:
# unzip is required to unpack the driver archive.
apt-get install unzip
# Download the ChromeDriver 2.43 Linux build.
# NOTE(review): the driver version is pinned to 2.43 while Chrome is installed as "current stable" — confirm the two versions are compatible.
wget https://chromedriver.storage.googleapis.com/2.43/chromedriver_linux64.zip
unzip chromedriver_linux64.zip
# Copy the binary into /usr/bin so it can be started by name.
cp chromedriver /usr/bin/
chrome headless 配置
启动 chrome 及简单配置:
// Options passed to the ChromeDriver service; none are set here.
var opts []selenium.ServiceOption

caps := selenium.Capabilities{"browserName": "chrome"}

// Chrome-specific settings that are attached to the session capabilities below.
chromeCaps := chrome.Capabilities{
	// Disable image loading to speed up rendering.
	Prefs: map[string]interface{}{
		"profile.managed_default_content_settings.images": 2,
	},
	Args: []string{
		"--headless",
		"--start-maximized",
		//"--window-size=1200x600",
		"--no-sandbox",
		"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
		"--disable-gpu",
		"--disable-impl-side-painting",
		"--disable-gpu-sandbox",
		"--disable-accelerated-2d-canvas",
		"--disable-accelerated-jpeg-decoding",
		"--test-type=ui",
	},
}
caps.AddChrome(chromeCaps)

// Start the chromedriver server on port and stop it when the enclosing function returns.
service, err := selenium.NewChromeDriverService("chromedriver", port, opts...)
if err != nil {
	log.Printf("Error starting the ChromeDriver server: %v", err)
	return
}
defer service.Stop()
打开一个网页:
// 打开 chrome 浏览器
wd, err := selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", port))
if err != nil {
log.Println(err)
return
}
defer wd.Quit()
然后加载 URL:
err = wd.Get(curURL)
if err != nil {
log.Println(fmt.Sprintf("Failed to load page: %s\n", err))
}
判断页面是否加载完成:
// Ask the page for document.readyState to decide whether loading has finished.
jsRt, err := wd.ExecuteScript("return document.readyState", nil)
if err != nil {
	// Return here: without a script result the readyState comparison below
	// would only report a misleading "page not loaded" message.
	log.Println("exe js err", err)
	return
}
fmt.Println("jsRt", jsRt)
// Anything other than "complete" is treated as a page that has not finished loading.
if jsRt != "complete" {
	log.Println("网页加载未完成")
	return
}
获取网页内容:
var frameHtml string
time.Sleep(1 * time.Second)
frameHtml, err = wd.PageSource()
if err != nil {
log.Println(err)
return
}
解析 html 内容
这里推荐使用 goquery:
var doc *goquery.Document
doc, err = goquery.NewDocumentFromReader(bytes.NewReader([]byte(frameHtml)))
if err != nil {
log.Println(err)
return
}
doc.Find("li.s-result-item").Each(func(liIndex int, liItem *goquery.Selection) {
// do something
})
selenium go 驱动库
selenium https://github.com/tebeka/selenium
本文网址: https://golangnote.com/topic/232.html 转摘请注明来源