package main
import (
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)
// Shared crawler state used across the producer/consumer pipeline.
var (
// storyUrlChan carries chapter URLs from ParsePages (producer) to
// urlToDocument (consumer); closed by ParsePages when the catalog is done.
storyUrlChan chan string
// maps holds the forged request headers added to every outgoing request.
// NOTE(review): the name shadows the stdlib "maps" package (Go 1.21+);
// consider renaming to headers.
maps map[string]string
// title accumulates chapter headings in fetch order.
title []string
// contents accumulates chapter body texts in fetch order.
contents []string
)
// main wires up the crawl: set forged request headers, fetch the catalog
// page, then stream every chapter URL through a channel to be fetched,
// parsed, and finally saved.
func main() {
	storyUrlChan = make(chan string, 10000)

	// Forged request headers (fill in your own browser's values).
	maps = map[string]string{
		"Content-type": "application/json;charset=utf-8",
		"user-agent":   "浏览器自己找",
		"Cookie":       "浏览器自己找",
	}

	catalogURL := "https://book.qidian.com/info/1036504904/#Catalog"
	catalog := GetUrlReader(catalogURL)

	// Producer: collect every chapter URL into the channel, then close it.
	go ParsePages(catalog)
	// Consumer: fetch and parse each chapter until the channel is closed.
	urlToDocument()
	// Persist the scraped story (currently a stub).
	printStory()
}
// GetUrlReader 获取url对应的页面可读流
func GetUrlReader(url string) *goquery.Document {
clent := &http.Client{}
request, err := http.NewRequest("GET", url, nil)
if err != nil {
fmt.Printf("构建请求出错:%v", request)
}
AddFiledInHead(request)
do, err := clent.Do(request)
if err != nil {
fmt.Printf("请求异常:%v", err)
}
defer do.Body.Close()
reader, err := goquery.NewDocumentFromReader(do.Body)
if err != nil {
fmt.Printf("NewDocumentFromReader err : %v", err)
}
return reader
}
// AddFiledInHead copies every key/value pair from the package-level header
// map onto the outgoing request's headers.
func AddFiledInHead(req *http.Request) {
	for name, val := range maps {
		req.Header.Add(name, val)
	}
}
// urlToDocument drains the chapter-URL channel, fetching and parsing one
// chapter per URL, and returns once the producer closes the channel.
func urlToDocument() {
	for {
		storyUrl, ok := <-storyUrlChan
		if !ok {
			return
		}
		ParseStory(GetUrlReader(storyUrl))
	}
}
// ParseStory extracts the chapter heading and body text from a chapter page
// and appends them to the package-level title/contents slices.
//
// BUG FIX: the original appended the (untrimmed) body text to the title
// slice instead of the heading; titles and contents are now stored in
// matching, trimmed pairs.
func ParseStory(doc *goquery.Document) {
	allContent := doc.Find(".main-text-wrap")
	// Chapter heading (the chapter list itself was scraped earlier).
	heard := allContent.Find(".text-head .content-wrap").Text()
	content := allContent.Find(".read-content").Text()
	heard = strings.Trim(heard, " ")
	content = strings.Trim(content, " ")
	title = append(title, heard)
	contents = append(contents, content)
	fmt.Println(heard, content)
}
// ParsePages walks the catalog page's chapter list, pushes each chapter URL
// into storyUrlChan, and closes the channel when done so the consumer in
// urlToDocument can exit its receive loop.
//
// Fixes over the original: the Attr ok flag is now checked so anchors
// without an href no longer emit a bare "https:" URL, the defer is placed
// at the top where it reads naturally, and the redundant bare return is
// gone.
func ParsePages(doc *goquery.Document) {
	defer close(storyUrlChan)
	doc.Find("#j-catalogWrap li").Each(func(i int, selection *goquery.Selection) {
		attr, ok := selection.Find("a").Attr("href")
		if !ok || attr == "" {
			// Skip list items whose link has no target.
			return
		}
		storyUrlChan <- "https:" + attr
	})
}
// printStory saves the scraped novel to a folder.
// TODO: not implemented yet — the author plans to store the data in a
// database instead once the design is settled.
func printStory() {
}
// 使用golang爬取某小说 ("Scraping a novel with Go" — original article title,
// left over from the blog page this code was copied from; kept as a comment
// so the file compiles)
// 最新推荐文章于 2025-07-03 20:07:49 发布