From 9688c5c3a7919137c93979d6ba39c1c64efff15f Mon Sep 17 00:00:00 2001
From: jacksontong
Date: Sun, 22 May 2022 10:06:36 +0800
Subject: [PATCH] Code refactoring

---
 .gitignore                                    | 41 +++++++++
 Makefile                                      | 48 ++++++++++
 README                                        |  2 +-
 build.sh                                      | 78 ----------------
 {src/main => cmd}/mini_spider.go              | 47 ++++------
 conf/spider.conf                              |  4 +-
 {data => conf}/url.data                       |  0
 go.mod                                        | 14 +++
 go.sum                                        | 10 ++
 .../config}/conf_basic.go                     |  2 +-
 .../config}/config.go                         |  6 +-
 {src/mini_spider => pkg/crawler}/crawler.go   | 29 +++---
 pkg/crawler/crawler_test.go                   | 67 ++++++++++++++
 pkg/http_util/http_util.go                    | 48 ++++++++++
 pkg/http_util/http_util_test.go               | 59 ++++++++++++
 src/mini_spider/queue.go => pkg/model/task_queue.go | 29 +++---
 .../model/task_queue_test.go                  | 26 ++----
 {src/mini_spider => pkg/model}/url_table.go   |  2 +-
 .../model}/url_table_test.go                  |  2 +-
 .../mini_spider => pkg/seed}/seedfile_load.go |  2 +-
 .../seed}/seedfile_load_test.go               |  4 +-
 .../mini_spider => pkg/spider}/mini_spider.go | 35 +++----
 pkg/spider/mini_spider_test.go                | 27 ++++++
 .../webpage}/webpage_parse.go                 |  4 +-
 .../webpage}/webpage_parse_test.go            |  4 +-
 .../webpage}/webpage_save.go                  |  2 +-
 .../webpage}/webpage_save_test.go             | 11 +--
 src/.DS_Store                                 | Bin 6148 -> 0 bytes
 src/mini_spider/crawler_test.go               | 55 -----------
 src/mini_spider/mini_spider_test.go           | 28 ------
 src/mini_spider/webpage_parse.go              | 87 ------------------
 src/mini_spider/webpage_save.go               | 84 -----------------
 src/mini_spider/webpage_save_test.go          | 38 --------
 src/web_package/webpage_parse_test.go         | 34 -------
 start.sh                                      |  4 +-
 .../test_data => test}/seedfile/url.data      |  0
 .../test_data => test}/spider.conf            |  6 +-
 37 files changed, 413 insertions(+), 526 deletions(-)
 create mode 100644 Makefile
 delete mode 100644 build.sh
 rename {src/main => cmd}/mini_spider.go (63%)
 rename {data => conf}/url.data (100%)
 create mode 100644 go.mod
 create mode 100644 go.sum
 rename {src/mini_spider_config => pkg/config}/conf_basic.go (98%)
 rename {src/mini_spider_config => pkg/config}/config.go (94%)
 rename {src/mini_spider => pkg/crawler}/crawler.go (76%)
 create mode 100644 pkg/crawler/crawler_test.go
 create mode 100644 pkg/http_util/http_util.go
 create mode 100644 pkg/http_util/http_util_test.go
 rename src/mini_spider/queue.go => pkg/model/task_queue.go (80%)
 rename src/mini_spider/queue_test.go => pkg/model/task_queue_test.go (75%)
 rename {src/mini_spider => pkg/model}/url_table.go (96%)
 rename {src/mini_spider => pkg/model}/url_table_test.go (96%)
 rename {src/mini_spider => pkg/seed}/seedfile_load.go (97%)
 rename {src/mini_spider => pkg/seed}/seedfile_load_test.go (89%)
 rename {src/mini_spider => pkg/spider}/mini_spider.go (59%)
 create mode 100644 pkg/spider/mini_spider_test.go
 rename {src/web_package => pkg/webpage}/webpage_parse.go (98%)
 rename {src/mini_spider => pkg/webpage}/webpage_parse_test.go (88%)
 rename {src/web_package => pkg/webpage}/webpage_save.go (98%)
 rename {src/web_package => pkg/webpage}/webpage_save_test.go (74%)
 delete mode 100644 src/.DS_Store
 delete mode 100644 src/mini_spider/crawler_test.go
 delete mode 100644 src/mini_spider/mini_spider_test.go
 delete mode 100644 src/mini_spider/webpage_parse.go
 delete mode 100644 src/mini_spider/webpage_save.go
 delete mode 100644 src/mini_spider/webpage_save_test.go
 delete mode 100644 src/web_package/webpage_parse_test.go
 mode change 100644 => 100755 start.sh
 rename {src/mini_spider/test_data => test}/seedfile/url.data (100%)
 rename {src/mini_spider_config/test_data => test}/spider.conf (80%)

diff --git a/.gitignore b/.gitignore
index e43b0f9..1a56d97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,42 @@
+# OSX leaves these everywhere on SMB shares
+._*
+
+# OSX trash
 .DS_Store
+
+# Eclipse files
+.classpath
+.project
+.settings/**
+
+# Files generated by JetBrains IDEs, e.g. IntelliJ IDEA
+.idea/
+*.iml
+.bin/
+
+# Vscode files
+.vscode
+
+# This is where the result of the go build goes
+/output*/
+/_output*/
+/_output
+
+# Emacs save files
+*~
+\#*\#
+.\#*
+
+# Vim-related files
+[._]*.s[a-w][a-z]
+[._]s[a-w][a-z]
+*.un~
+Session.vim
+.netrwhist
+
+# Generated by makefile
+bin
+node_modules
+package.json
+yarn.lock
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1b53554
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,48 @@
+# init project path
+HOMEDIR := $(shell pwd)
+OUTDIR := $(HOMEDIR)/output
+BIN := mini_spider
+
+VERSION=`git describe --always`
+BUILDTIME=`date +%FT%T%z`
+GOVERSION=`go version`
+
+# init command params
+GO := go
+GOOS := linux
+GOBUILD := $(GO) build
+GOTEST := $(GO) test
+GOPKGS := $$($(GO) list ./pkg... | grep -vE "vendor")
+
+# Setup the -ldflags option for go build here, interpolate the variable values
+LDFLAGS=-ldflags "-w -s -X main.Version=${VERSION} -X 'main.BuildTime=${BUILDTIME}' -X 'main.GoVersion=${GOVERSION}'"
+
+# make, make all
+all: prepare compile package
+
+# make prepare
+prepare:
+
+# make compile, go build
+compile: build
+build:
+	GOOS=$(GOOS) $(GOBUILD) ${LDFLAGS} -o $(HOMEDIR)/$(BIN) cmd/mini_spider.go
+
+# make test, test your code
+test: test-case
+test-case:
+	$(GOTEST) -v -cover $(GOPKGS)
+	rm -r test/webpage
+
+# make package
+package: package-bin
+package-bin:
+	mkdir -p $(OUTDIR)
+	mv $(HOMEDIR)/$(BIN) $(OUTDIR)/
+
+# make clean
+clean:
+	rm -rf $(OUTDIR)
+
+# declare phony targets to avoid filename conflicts and speed up builds
+.PHONY: all prepare compile test package clean build
diff --git a/README b/README
index 4421fc8..0108c35 100644
--- a/README
+++ b/README
@@ -2,7 +2,7 @@
 During investigations we often need targeted crawling of specific websites. Since Go ships with many powerful libraries, writing a targeted crawler in Go is fairly simple. Please use Go to develop a mini targeted crawler, mini_spider, which crawls from seed links and saves to disk the web pages whose URLs match a given pattern.
 
 [Run]:
-    ./mini_spider -c ../conf -l ../log
+    ./mini_spider -c ../conf/spider.conf -l ./log
 
 [Config file spider.conf]:
 [spider]
diff --git a/build.sh b/build.sh
deleted file mode 100644
index 72dea6b..0000000
--- a/build.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/bash
-
-### dir structure
-# /bfe
-#   /bfe-common
-#     /go
-#     /output
-#     /golang-lib
-#   /mini_spider
-#     build.sh
-
-### restore working dir
-WORKROOT=$(pwd)
-
-cd ${WORKROOT}
-
-# prepare PATH, GOROOT and GOPATH
-export GOPATH=$(pwd)
-
-# export golang-lib to GOPATH
-cd ${WORKROOT}
-export GOPATH=$(pwd)/../bfe-common/golang-lib:$GOPATH
-
-# run go test for all subdirectory
-cd ${WORKROOT}/src/mini_spider
-go test -c -o ./testRun
-if [ $? -ne 0 ];
-then
-    echo "go compile test failed"
-    exit 1
-fi
-
-go test -run testRun
-if [ $? -ne 0 ];
-then
-    echo "go run test failed"
-    exit 1
-fi
-rm -rf ./testRun
-echo "OK for go test"
-
-### build
-cd ${WORKROOT}/src/main
-go build -o mini_spider
-if [ $? -ne 0 ];
-then
-    echo "fail to go build mini_spider.go"
-    exit 1
-fi
-echo "OK for go build mini_spider.go"
-
-### create directory for output
-cd ../../
-if [ -d "./output" ]
-then
-    rm -rf output
-fi
-mkdir output
-
-# copy config
-mkdir output/conf
-cp conf/spider.conf output/conf
-
-# copy data
-cp -r data output/
-
-# copy file to bin
-mkdir output/bin
-mv src/main/mini_spider output/bin
-
-# create dir for log
-mkdir output/log
-
-# change mode of files in /bin
-chmod +x output/bin/mini_spider
-
-
-echo "OK for build mini_spider"
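The LDFLAGS value in the new Makefile stamps build metadata into the binary at link time; -X can only override package-level string variables in the named package. A minimal sketch of the receiving side (the cmd/mini_spider.go hunk below declares exactly these three variables):

    package main

    import "fmt"

    // Populated at link time via:
    //   go build -ldflags "-X main.Version=... -X 'main.BuildTime=...' -X 'main.GoVersion=...'"
    // Values not supplied with -X simply stay "".
    var (
        Version   string
        BuildTime string
        GoVersion string
    )

    func main() {
        fmt.Println("Version:", Version, "Build:", BuildTime, "Go:", GoVersion)
    }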
diff --git a/src/main/mini_spider.go b/cmd/mini_spider.go
similarity index 63%
rename from src/main/mini_spider.go
rename to cmd/mini_spider.go
index ff3de6b..e1bc465 100644
--- a/src/main/mini_spider.go
+++ b/cmd/mini_spider.go
@@ -18,28 +18,25 @@ import (
 	"os/signal"
 	"syscall"
 	"time"
-)
-
-import (
-	"code.google.com/p/log4go"
-	"www.baidu.com/golang-lib/log"
-)
+
+	"github.com/baidu/go-lib/log"
 
-import (
-	"mini_spider_config"
-	"mini_spider"
+	mini_spider_config "github.com/cumirror/mini-spider/pkg/config"
+	"github.com/cumirror/mini-spider/pkg/seed"
+	mini_spider "github.com/cumirror/mini-spider/pkg/spider"
 )
 
 var (
-	confPath *string = flag.String("c", "../conf/spider.conf", "mini_spider configure path")
-	help *bool = flag.Bool("h", false, "show help")
-	logPath *string = flag.String("l", "../log", "dir path of log")
-	showVer *bool = flag.Bool("v", false, "show version")
-	stdOut *bool = flag.Bool("s", false, "to show log in stdout")
-	debugLog *bool = flag.Bool("d", false, "to show debug log (otherwise >= info)")
+	confPath *string = flag.String("c", "../conf/spider.conf", "mini_spider configure path")
+	help     *bool   = flag.Bool("h", false, "show help")
+	logPath  *string = flag.String("l", "../log", "dir path of log")
+	showVer  *bool   = flag.Bool("v", false, "show version")
+	stdOut   *bool   = flag.Bool("s", false, "to show log in stdout")
+	debugLog *bool   = flag.Bool("d", false, "to show debug log (otherwise >= info)")
 )
 
+var Version, BuildTime, GoVersion string
+
 func Exit(code int) {
 	log.Logger.Close()
 	/* to overcome bug in log, sleep for a while */
@@ -57,7 +54,7 @@ func main() {
 		return
 	}
 	if *showVer {
-		fmt.Printf("version is: 1.0.0\n")
+		fmt.Println("Version:", Version, "Build:", BuildTime, "Go:", GoVersion)
 		return
 	}
 
@@ -69,14 +66,6 @@ func main() {
 	}
 	fmt.Printf("mini_spider starts...\n")
 
-	/* initialize log */
-	/* set log buffer size */
-	log4go.SetLogBufferLength(10000)
-	/* if blocking, log will be dropped */
-	log4go.SetLogWithBlocking(false)
-	/* we want to get state of log4go */
-	log4go.SetWithModuleState(true)
-
 	err := log.Init("mini_spider", logSwitch, *logPath, *stdOut, "midnight", 5)
 	if err != nil {
 		fmt.Printf("main(): err in log.Init():%s\n", err.Error())
@@ -91,14 +80,14 @@
 	}
 
 	// load seeds
-	seeds, err := mini_spider.LoadSeedFile(config.Basic.UrlListFile)
+	seeds, err := seed.LoadSeedFile(config.Basic.UrlListFile)
 	if err != nil {
 		log.Logger.Error("main():err in loadSeedFile(%s):%s", config.Basic.UrlListFile, err.Error())
 		Exit(1)
 	}
 
 	// create mini-spider
-	miniSpider, err:= mini_spider.NewMiniSpider(&config, seeds)
+	miniSpider, err := mini_spider.NewMiniSpider(&config, seeds)
 	if err != nil {
 		log.Logger.Error("main():err in NewMiniSpider():%s", err.Error())
 		Exit(1)
@@ -107,7 +96,7 @@
 	// run mini-spider
 	miniSpider.Run()
 
-	// waiting for all tasks to finish. 
+	// waiting for all tasks to finish.
 	go func() {
 		for {
 			if miniSpider.GetUnfinished() == 0 {
@@ -120,8 +109,8 @@
 			// sleep for a while
 			time.Sleep(5 * time.Second)
 		}
-	} ()
-	
+	}()
+
 	// Handle SIGINT and SIGTERM.
 	ch := make(chan os.Signal)
 	signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)
diff --git a/conf/spider.conf b/conf/spider.conf
index 169007b..97650db 100644
--- a/conf/spider.conf
+++ b/conf/spider.conf
@@ -1,9 +1,9 @@
 [Basic]
 # path of the seed file
-urlListFile = ../data/url.data
+urlListFile = ../conf/url.data
 
 # directory to store crawled pages
-outputDirectory = ../webpage
+outputDirectory = ./webpage
 
 # max crawl depth (seeds are level 0)
 maxDepth = 2
diff --git a/data/url.data b/conf/url.data
similarity index 100%
rename from data/url.data
rename to conf/url.data
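A side note on the signal handling kept above: signal.Notify performs non-blocking sends, so the signal package documentation recommends a buffered channel; with make(chan os.Signal) a signal that arrives before the goroutine is receiving can be dropped. A sketch of the safer form:

    package main

    import (
        "fmt"
        "os"
        "os/signal"
        "syscall"
    )

    func main() {
        // Buffer of 1: signal.Notify never blocks, so an unbuffered
        // channel can silently miss the first SIGINT/SIGTERM.
        ch := make(chan os.Signal, 1)
        signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)
        fmt.Println("got signal:", <-ch)
    }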
"www.baidu.com/golang-lib/log" -) + "github.com/baidu/go-lib/log" -import ( - "mini_spider_config" - "web_package" + mini_spider_config "github.com/cumirror/mini-spider/pkg/config" + "github.com/cumirror/mini-spider/pkg/http_util" + "github.com/cumirror/mini-spider/pkg/model" + web_package "github.com/cumirror/mini-spider/pkg/webpage" ) type Crawler struct { - urlTable *UrlTable + urlTable *model.UrlTable config *mini_spider_config.BasicConfig - queue *Queue + queue *model.Queue urlPattern *regexp.Regexp stop bool } // create new crawler -func NewCrawler(urlTable *UrlTable, config *mini_spider_config.MiniSpiderConf, queue *Queue) *Crawler { +func NewCrawler(urlTable *model.UrlTable, config *mini_spider_config.MiniSpiderConf, queue *model.Queue) *Crawler { c := new(Crawler) c.urlTable = urlTable c.config = &config.Basic @@ -81,7 +78,7 @@ func (c *Crawler) Run() { // continue crawling until max depth if task.Depth < c.config.MaxDepth { err = c.crawlChild(data, task) - if(err != nil){ + if err != nil { log.Logger.Error("crawlChild(%s):%s in depth of %d", task.Url, err.Error(), task.Depth) } } @@ -90,7 +87,7 @@ func (c *Crawler) Run() { c.queue.FinishOneTask() // sleep for a while - time.Sleep(time.Duration(c.config.CrawlInterval) * time.Second) + time.Sleep(time.Duration(c.config.CrawlInterval) * time.Second) } } @@ -100,7 +97,7 @@ func (c *Crawler) Stop() { } // crawl child url -func (c *Crawler) crawlChild(data []byte, task *CrawlTask ) error { +func (c *Crawler) crawlChild(data []byte, task *model.CrawlTask) error { // parse url from web page links, err := web_package.ParseWebPage(data, task.Url) if err != nil { @@ -114,7 +111,7 @@ func (c *Crawler) crawlChild(data []byte, task *CrawlTask ) error { continue } - taskNew := &CrawlTask{Url: link, Depth: task.Depth + 1, Header: make(map[string]string)} + taskNew := &model.CrawlTask{Url: link, Depth: task.Depth + 1, Header: make(map[string]string)} log.Logger.Debug("add to queue: url=%s, depth=%d", taskNew.Url, taskNew.Depth) c.queue.Add(taskNew) } diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go new file mode 100644 index 0000000..e8834cf --- /dev/null +++ b/pkg/crawler/crawler_test.go @@ -0,0 +1,67 @@ +/* crawler_test.go: test for crawler */ +/* +modification history +-------------------- +2017/07/21, by Xiongmin LIN, create +*/ +/* +DESCRIPTION +*/ + +package crawler + +import ( + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" + + mini_spider_config "github.com/cumirror/mini-spider/pkg/config" + "github.com/cumirror/mini-spider/pkg/model" +) + +func TestCrawler(t *testing.T) { + html := ` + + + + + mini-spider + + +

diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go
new file mode 100644
index 0000000..e8834cf
--- /dev/null
+++ b/pkg/crawler/crawler_test.go
@@ -0,0 +1,67 @@
+/* crawler_test.go: test for crawler */
+/*
+modification history
+--------------------
+2017/07/21, by Xiongmin LIN, create
+*/
+/*
+DESCRIPTION
+*/
+
+package crawler
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	mini_spider_config "github.com/cumirror/mini-spider/pkg/config"
+	"github.com/cumirror/mini-spider/pkg/model"
+)
+
+func TestCrawler(t *testing.T) {
+	html := `
+<html>
+<head>
+<title>mini-spider</title>
+</head>
+<body>
+Absolute Path: <a href="https://just998.com/xianbao/41834973.html">a</a>
+Relative Path: <a href="b.html">b</a>
+</body>
+</html>`
+	svr := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		fmt.Fprintf(w, html)
+	}))
+	defer svr.Close()
+
+	urlTable := model.NewUrlTable()
+	conf, _ := mini_spider_config.LoadConfig("../../test/spider.conf")
+
+	var queue model.Queue
+	queue.Init()
+	queue.Add(&model.CrawlTask{svr.URL, 1, nil})
+
+	c := NewCrawler(urlTable, &conf, &queue)
+
+	go c.Run() // Run never returns on its own, so no WaitGroup; just give it time
+	time.Sleep(time.Second * 5)
+
+	// check visit result
+	verifyLinks := []string{
+		"https://just998.com/xianbao/41834973.html", // use small site which would not get 403 error
+		svr.URL + "/b.html",
+	}
+
+	for _, link := range verifyLinks {
+		if !c.urlTable.Exist(link) {
+			t.Errorf("%s not visited", link)
+		}
+	}
+}
diff --git a/pkg/http_util/http_util.go b/pkg/http_util/http_util.go
new file mode 100644
index 0000000..12ab02e
--- /dev/null
+++ b/pkg/http_util/http_util.go
@@ -0,0 +1,48 @@
+package http_util
+
+import (
+	"crypto/tls"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"time"
+)
+
+func Read(urlPath string, timeout int, headers map[string]string) ([]byte, error) {
+	// prepare http request
+	req, err := http.NewRequest("GET", urlPath, nil)
+	if err != nil {
+		return nil, fmt.Errorf("http.NewRequest():%s", err.Error())
+	}
+	for key, value := range headers {
+		req.Header.Set(key, value)
+	}
+
+	// prepare http client
+	client := http.Client{
+		Timeout: time.Duration(timeout) * time.Second,
+		Transport: &http.Transport{
+			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+			//DisableKeepAlives: true,
+		},
+	}
+
+	// send request
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("http client.do():%s", err.Error())
+	}
+	defer resp.Body.Close()
+
+	// check status code
+	if resp.StatusCode != 200 {
+		return nil, fmt.Errorf("status code:%d", resp.StatusCode)
+	}
+
+	// read from response
+	bytes, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("ReadAll():%s", err.Error())
+	}
+	return bytes, nil
+}
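A usage sketch for the Read helper above. The timeout is in seconds, and 0 disables the client-side timeout entirely, since http.Client treats a zero Timeout as "no timeout"; the URL and header values here are purely illustrative:

    package main

    import (
        "fmt"

        "github.com/cumirror/mini-spider/pkg/http_util"
    )

    func main() {
        // GET with a 3-second timeout and one extra request header.
        body, err := http_util.Read("https://example.com/", 3,
            map[string]string{"User-Agent": "mini-spider"})
        if err != nil {
            fmt.Println("fetch failed:", err)
            return
        }
        fmt.Printf("fetched %d bytes\n", len(body))
    }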
diff --git a/pkg/http_util/http_util_test.go b/pkg/http_util/http_util_test.go
new file mode 100644
index 0000000..d810eca
--- /dev/null
+++ b/pkg/http_util/http_util_test.go
@@ -0,0 +1,59 @@
+package http_util
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+func TestHttpRead(t *testing.T) {
+	timeout := time.Second * 2
+	header := map[string]string{
+		"User": "mini-spider",
+	}
+	rspData := "OK"
+	svr := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/ok":
+			w.WriteHeader(http.StatusOK)
+			fmt.Fprintf(w, rspData)
+		case "/timeout":
+			time.Sleep(timeout)
+			w.WriteHeader(http.StatusOK)
+			fmt.Fprintf(w, rspData)
+		case "/header":
+			if r.Header["User"][0] == "mini-spider" {
+				w.WriteHeader(http.StatusOK)
+				fmt.Fprintf(w, rspData)
+			} else {
+				w.WriteHeader(http.StatusBadRequest)
+			}
+		}
+	}))
+	defer svr.Close()
+
+	// 1. test ok
+	data, err := Read(svr.URL+"/ok", 0, nil)
+	if err != nil || string(data) != rspData {
+		t.Errorf("Read not as expect: %s %s", string(data), err)
+	}
+
+	// 2.1 test timeout: within the threshold (2s)
+	data, err = Read(svr.URL+"/timeout", 3, nil)
+	if err != nil || string(data) != rspData {
+		t.Errorf("Read not as expect: %s %s", string(data), err)
+	}
+	// 2.2 test timeout: outside the threshold (2s)
+	data, err = Read(svr.URL+"/timeout", 1, nil)
+	if err == nil {
+		t.Error("Read not as expect")
+	}
+
+	// 3. test header
+	data, err = Read(svr.URL+"/header", 0, header)
+	if err != nil || string(data) != rspData {
+		t.Errorf("Read not as expect: %s %s", string(data), err)
+	}
+}
diff --git a/src/mini_spider/queue.go b/pkg/model/task_queue.go
similarity index 80%
rename from src/mini_spider/queue.go
rename to pkg/model/task_queue.go
index c0c4672..ce8486d 100644
--- a/src/mini_spider/queue.go
+++ b/pkg/model/task_queue.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package mini_spider
+package model
 
 import (
 	"container/list"
@@ -17,16 +17,23 @@ import (
 
 /* max queue length */
 const (
-	MAX_QUEUE_LEN  = 65535
+	MAX_QUEUE_LEN = 65535
 )
 
+// crawl task
+type CrawlTask struct {
+	Url    string            // url to crawl
+	Depth  int               // depth of the url
+	Header map[string]string // http header
+}
+
 /* queue */
 type Queue struct {
-	lock sync.Mutex
-	cond *sync.Cond
-	tasks *list.List
-	maxLen int // max queue length
-	unfinished int // number of unfinished tasks
+	lock       sync.Mutex
+	cond       *sync.Cond
+	tasks      *list.List
+	maxLen     int // max queue length
+	unfinished int // number of unfinished tasks
 }
 
 /* Initialize the queue */
@@ -51,7 +58,7 @@ func (q *Queue) Add(task *CrawlTask) error {
 		q.cond.Signal()
 		err = nil
 	}
-	
+
 	return err
 }
 
@@ -77,7 +84,7 @@ func (q *Queue) Len() int {
 
 	var len int
 	len = q.tasks.Len()
-	
+
 	return len
 }
 
@@ -93,9 +100,9 @@ func (q *Queue) FinishOneTask() {
 func (q *Queue) GetUnfinished() int {
 	q.lock.Lock()
 	defer q.lock.Unlock()
-	
+
 	ret := q.unfinished
-	
+
 	return ret
}
diff --git a/src/mini_spider/queue_test.go b/pkg/model/task_queue_test.go
similarity index 75%
rename from src/mini_spider/queue_test.go
rename to pkg/model/task_queue_test.go
index ac243f2..b947796 100644
--- a/src/mini_spider/queue_test.go
+++ b/pkg/model/task_queue_test.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package mini_spider
+package model
 
 import (
 	"testing"
@@ -19,7 +19,7 @@ func TestQueue(t *testing.T) {
 
 	queue.Add(&CrawlTask{"http://www.baidu.com", 2, nil})
 	queue.Add(&CrawlTask{"http://www.sina.com", 1, nil})
-	
+
 	queue.SetMaxLen(2)
 
 	err := queue.Add(&CrawlTask{"http://www.test.com", 1, nil})
@@ -30,23 +30,23 @@ func TestQueue(t *testing.T) {
 
 	task := queue.Pop()
 	if task == nil || task.Url != "http://www.baidu.com" {
-		t.Errorf("www.baidu.com should be poped first")	
-	}		
+		t.Errorf("www.baidu.com should be popped first")
+	}
 
 	qLen := queue.Len()
 	if qLen != 1 {
-		t.Error("queue.Len() should be 1, now it's %d", qLen)
+		t.Errorf("queue.Len() should be 1, now it's %d", qLen)
 	}
-	
+
 	n := queue.GetUnfinished()
 	if n != 2 {
-		t.Error("queue.GetUnfinished() should be 2, now it's %d", n)
+		t.Errorf("queue.GetUnfinished() should be 2, now it's %d", n)
 	}
-	
+
 	queue.FinishOneTask()
 	n = queue.GetUnfinished()
 	if n != 1 {
-		t.Error("queue.GetUnfinished() should be 1, now it's %d", n)
+		t.Errorf("queue.GetUnfinished() should be 1, now it's %d", n)
 	}
 
 	task = queue.Pop()
@@ -54,15 +54,9 @@ func TestQueue(t *testing.T) {
 		t.Error("queue should pop http://www.sina.com with depth 1")
 	}
 
-	task = queue.Pop()
-	if task != nil {
-		t.Error("queue should pop nil")
-	}
-
 	queue.FinishOneTask()
 	n = queue.GetUnfinished()
 	if n != 0 {
 		t.Error("all tasks in queue had finished, GetUnfinished() should return 0")
 	}
-
-}
\ No newline at end of file
+}
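The queue is a bounded list guarded by a mutex and a sync.Cond: Add rejects tasks once maxLen is reached, and the unfinished counter (raised by Add, lowered by FinishOneTask) is what main() polls to decide when the crawl is done. A minimal single-goroutine sketch of that lifecycle, using only the methods visible in this patch (Pop's exact blocking behavior is not shown in the hunks, so a task is enqueued first):

    package main

    import (
        "fmt"

        "github.com/cumirror/mini-spider/pkg/model"
    )

    func main() {
        var q model.Queue
        q.Init()

        // producer side: enqueue one seed task
        q.Add(&model.CrawlTask{Url: "http://example.com", Depth: 1, Header: nil})

        // consumer side: take the task, do the work, then mark it finished
        // so GetUnfinished() can serve as the termination condition.
        task := q.Pop()
        fmt.Println("crawling", task.Url, "at depth", task.Depth)
        q.FinishOneTask()

        fmt.Println("unfinished:", q.GetUnfinished()) // 0 -> safe to exit
    }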
diff --git a/src/mini_spider/url_table.go b/pkg/model/url_table.go
similarity index 96%
rename from src/mini_spider/url_table.go
rename to pkg/model/url_table.go
index 204b4d6..1cc6680 100644
--- a/src/mini_spider/url_table.go
+++ b/pkg/model/url_table.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package mini_spider
+package model
 
 import (
 	"sync"
diff --git a/src/mini_spider/url_table_test.go b/pkg/model/url_table_test.go
similarity index 96%
rename from src/mini_spider/url_table_test.go
rename to pkg/model/url_table_test.go
index d09fde6..fa776e5 100644
--- a/src/mini_spider/url_table_test.go
+++ b/pkg/model/url_table_test.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package mini_spider
+package model
 
 import (
 	"testing"
diff --git a/src/mini_spider/seedfile_load.go b/pkg/seed/seedfile_load.go
similarity index 97%
rename from src/mini_spider/seedfile_load.go
rename to pkg/seed/seedfile_load.go
index 4fbafcd..c4133bf 100644
--- a/src/mini_spider/seedfile_load.go
+++ b/pkg/seed/seedfile_load.go
@@ -8,7 +8,7 @@ modification history
 DESCRIPTION
 */
 
-package mini_spider
+package seed
 
 import (
 	"encoding/json"
diff --git a/src/mini_spider/seedfile_load_test.go b/pkg/seed/seedfile_load_test.go
similarity index 89%
rename from src/mini_spider/seedfile_load_test.go
rename to pkg/seed/seedfile_load_test.go
index 9a33036..6138a96 100644
--- a/src/mini_spider/seedfile_load_test.go
+++ b/pkg/seed/seedfile_load_test.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package mini_spider
+package seed
 
 import (
 	"testing"
@@ -15,7 +15,7 @@ import (
 
 // test for LoadSeedFile()
 func TestLoadSeedFile(t *testing.T) {
-	filePath := "./test_data/seedfile/url.data"
+	filePath := "../../test/seedfile/url.data"
 	seeds, err := LoadSeedFile(filePath)
 	if err != nil {
 		t.Errorf("err in seedFileLoad(%s):%s", filePath, err.Error())
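LoadSeedFile's body is untouched by the rename, but its encoding/json import suggests url.data is a JSON list of seed URLs; the file's actual content is not shown in this patch, so the sample data in the comment below is hypothetical:

    package main

    import (
        "fmt"

        "github.com/cumirror/mini-spider/pkg/seed"
    )

    func main() {
        // Hypothetical url.data content (not part of this patch):
        //   ["http://www.baidu.com", "http://www.sina.com.cn"]
        seeds, err := seed.LoadSeedFile("conf/url.data")
        if err != nil {
            fmt.Println("load seeds failed:", err)
            return
        }
        for _, s := range seeds {
            fmt.Println("seed:", s) // each seed becomes a depth-1 CrawlTask
        }
    }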
diff --git a/src/mini_spider/mini_spider.go b/pkg/spider/mini_spider.go
similarity index 59%
rename from src/mini_spider/mini_spider.go
rename to pkg/spider/mini_spider.go
index 55898ad..209d941 100644
--- a/src/mini_spider/mini_spider.go
+++ b/pkg/spider/mini_spider.go
@@ -9,24 +9,19 @@
 DESCRIPTION
     - mini spider
 */
-package mini_spider
+package spider
 
 import (
-	"mini_spider_config"
+	mini_spider_config "github.com/cumirror/mini-spider/pkg/config"
+	"github.com/cumirror/mini-spider/pkg/crawler"
+	"github.com/cumirror/mini-spider/pkg/model"
 )
 
 type MiniSpider struct {
 	config *mini_spider_config.MiniSpiderConf
-	urlTable *UrlTable
-	queue Queue
-	crawlers []*Crawler
-}
-
-// crawl task
-type CrawlTask struct {
-	Url string // url to crawl
-	Depth int // depth of the url
-	Header map[string]string // http header
+	urlTable *model.UrlTable
+	queue model.Queue
+	crawlers []*crawler.Crawler
 }
 
 // create new mini-spider
@@ -35,22 +30,21 @@ func NewMiniSpider(conf *mini_spider_config.MiniSpiderConf, seeds []string) (*Mi
 	ms.config = conf
 
 	// create url table
-	ms.urlTable = NewUrlTable()
+	ms.urlTable = model.NewUrlTable()
 
 	// initialize queue
 	ms.queue.Init()
 
 	// add seeds to queue
 	for _, seed := range seeds {
-		task := &CrawlTask{Url: seed, Depth: 1, Header: make(map[string]string)}
+		task := &model.CrawlTask{Url: seed, Depth: 1, Header: make(map[string]string)}
 		ms.queue.Add(task)
 	}
 
 	// create crawlers, thread count was defined in conf
-	ms.crawlers = make([]*Crawler, 0)
+	ms.crawlers = make([]*crawler.Crawler, 0)
 	for i := 0; i < conf.Basic.ThreadCount; i++ {
-		crawler := NewCrawler(ms.urlTable, ms.config, &ms.queue)
-		ms.crawlers = append(ms.crawlers, crawler)
+		ms.crawlers = append(ms.crawlers, crawler.NewCrawler(ms.urlTable, ms.config, &ms.queue))
 	}
 
 	return ms, nil
@@ -59,13 +53,12 @@ func NewMiniSpider(conf *mini_spider_config.MiniSpiderConf, seeds []string) (*Mi
 // run mini spider
 func (ms *MiniSpider) Run() {
 	// start all crawlers
-	for _, crawler := range ms.crawlers {
-		go crawler.Run()
+	for _, c := range ms.crawlers {
+		go c.Run()
 	}
 }
 
 // get number of unfinished task
 func (ms *MiniSpider) GetUnfinished() int {
-
 	return ms.queue.GetUnfinished()
-}
\ No newline at end of file
+}
diff --git a/pkg/spider/mini_spider_test.go b/pkg/spider/mini_spider_test.go
new file mode 100644
index 0000000..fb2a9b0
--- /dev/null
+++ b/pkg/spider/mini_spider_test.go
@@ -0,0 +1,27 @@
+/* mini_spider_test.go: test for mini_spider.go */
+/*
+modification history
+--------------------
+2017/07/21, by Xiongmin LIN, create
+*/
+/*
+DESCRIPTION
+*/
+
+package spider
+
+import (
+	"testing"
+
+	mini_spider_config "github.com/cumirror/mini-spider/pkg/config"
+	"github.com/cumirror/mini-spider/pkg/seed"
+)
+
+func TestNewMiniSpider(t *testing.T) {
+	conf, _ := mini_spider_config.LoadConfig("../../test/spider.conf")
+	seeds, _ := seed.LoadSeedFile(conf.Basic.UrlListFile)
+	_, err := NewMiniSpider(&conf, seeds)
+	if err != nil {
+		t.Errorf("err happen in NewMiniSpider:%s", err.Error())
+	}
+}
diff --git a/src/web_package/webpage_parse.go b/pkg/webpage/webpage_parse.go
similarity index 98%
rename from src/web_package/webpage_parse.go
rename to pkg/webpage/webpage_parse.go
index 0f77d70..aa11729 100644
--- a/src/web_package/webpage_parse.go
+++ b/pkg/webpage/webpage_parse.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package web_package
+package webpage
 
 import (
 	"bytes"
@@ -78,7 +78,7 @@ func ParseWebPage(data []byte, urlStr string) ([]string, error) {
 	if err != nil {
 		return nil, fmt.Errorf("url.ParseRequestURI(%s):%s", urlStr, err.Error())
 	}
-	
+
 	// get all links
 	hl := NewHtmlLinks()
 	hl.getLinks(doc, refUrl)
diff --git a/src/mini_spider/webpage_parse_test.go b/pkg/webpage/webpage_parse_test.go
similarity index 88%
rename from src/mini_spider/webpage_parse_test.go
rename to pkg/webpage/webpage_parse_test.go
index f92cf46..77b8293 100644
--- a/src/mini_spider/webpage_parse_test.go
+++ b/pkg/webpage/webpage_parse_test.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package mini_spider
+package webpage
 
 import (
 	"testing"
@@ -17,7 +17,7 @@ import (
 func TestParseWebPage(t *testing.T) {
 	s := []byte(`
 <html>
 Links: <a href="test">link1</a> <a href="/test1/test2">link2</a>
 </html>
 `)
 
-	links, err := parseWebPage(s, "http://www.baidu.com/a/b.html")
+	links, err := ParseWebPage(s, "http://www.baidu.com/a/b.html")
 	if err != nil {
 		t.Errorf("err in parseWebPage():%s", err.Error())
 		return
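ParseWebPage (its full source appears in the deleted src/mini_spider/webpage_parse.go further down) parses the page with golang.org/x/net/html and resolves every <a href> against the page's own URL. A usage sketch with illustrative URLs:

    package main

    import (
        "fmt"

        webpage "github.com/cumirror/mini-spider/pkg/webpage"
    )

    func main() {
        page := []byte(`<html><body>
            <a href="child.html">child</a>
            <a href="/abs/path.html">abs</a>
        </body></html>`)

        // Relative hrefs are resolved against the page's URL.
        links, err := webpage.ParseWebPage(page, "http://example.com/dir/index.html")
        if err != nil {
            fmt.Println("parse failed:", err)
            return
        }
        for _, l := range links {
            fmt.Println(l)
        }
        // -> http://example.com/dir/child.html
        // -> http://example.com/abs/path.html
    }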
diff --git a/src/web_package/webpage_save.go b/pkg/webpage/webpage_save.go
similarity index 98%
rename from src/web_package/webpage_save.go
rename to pkg/webpage/webpage_save.go
index d94c119..b19694f 100644
--- a/src/web_package/webpage_save.go
+++ b/pkg/webpage/webpage_save.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package web_package
+package webpage
 
 import (
 	"fmt"
diff --git a/src/web_package/webpage_save_test.go b/pkg/webpage/webpage_save_test.go
similarity index 74%
rename from src/web_package/webpage_save_test.go
rename to pkg/webpage/webpage_save_test.go
index 3016d74..31bce7f 100644
--- a/src/web_package/webpage_save_test.go
+++ b/pkg/webpage/webpage_save_test.go
@@ -7,7 +7,7 @@ modification history
 /*
 DESCRIPTION
 */
-package web_package
+package webpage
 
 import (
 	"testing"
@@ -15,24 +15,23 @@ import (
 
 // test for genFilePath()
 func TestGenFilePath(t *testing.T) {
-	rootPath := "./output"
+	rootPath := "../../test/webpage/output"
 	url := "www.baidu.com"
 
 	filePath := genFilePath(url, rootPath)
-	if filePath != "./output/www.baidu.com" {
+	if filePath != "../../test/webpage/output/www.baidu.com" {
 		t.Errorf("err in genFilePath(), filePath=%s", filePath)
 	}
 }
 
 // test for saveWebPage()
 func TestSaveWebPage(t *testing.T) {
-	rootPath := "./output"
+	rootPath := "../../test/webpage/output"
 	url := "www.baidu.com"
 	data := []byte("this is a test")
 
-	err := saveWebPage(rootPath, url, data)
+	err := SaveWebPage(rootPath, url, data)
 	if err != nil {
 		t.Errorf("err in saveWebPage(%s, %s):%s", rootPath, url, err.Error())
 	}
 }
-
diff --git a/src/.DS_Store b/src/.DS_Store
deleted file mode 100644
index c5bfff63ca6c3222b9116e402ae7d624fabb0a68..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKyG{c!5S)c8BBe=5>0jUvtSEc|AHeGXS4p+u)EY0#{+9(%lN
z%TwIm0@7OVjRuS7>`f;$HQSii(Do=cfczi@PfC~{545Z
z^gA5!P8{)!kJFIlQ?(-nq<|EV0#ZNM1NQZGAy#%p2LF|Q7A{m-xDlt>7S`5o{@~z@};gp!^u(+A$)Xi2MipA~ZTZF@U
zqDCno1PJF

diff --git a/src/mini_spider/crawler_test.go b/src/mini_spider/crawler_test.go
deleted file mode 100644
index fde7a1e..0000000
--- a/src/mini_spider/crawler_test.go
+++ /dev/null
@@ -1,55 +0,0 @@
-/* crawler_test.go: test for crawler */
-/*
-modification history
---------------------
-2017/07/21, by Xiongmin LIN, create
-*/
-/*
-DESCRIPTION
-*/
-
-package mini_spider
-
-import (
-	"sync"
-	"testing"
-)
-
-import (
-	"mini_spider_config"
-)
-
-func TestCrawler(t *testing.T) {
-	urlTable := NewUrlTable()
-	conf, _ := mini_spider_config.LoadConfig("../mini_spider_config/test_data/spider.conf")
-
-	var queue Queue
-	queue.Init()
-	queue.Add(&CrawlTask{"http://pycm.baidu.com:8081", 2, nil})
-
-	c := NewCrawler(urlTable, &conf, &queue)
-
-	// wait and check result
-	var wg sync.WaitGroup
-	wg.Add(1)
-	go func() {
-		c.Run()
-		wg.Done()
-	}()
-	wg.Wait()
-
-	// check visit result
-	verifyLinks := []string{
-		"http://pycm.baidu.com:8081/page1.html",
-		"http://pycm.baidu.com:8081/page2.html",
-		"http://pycm.baidu.com:8081/page3.html",
-		"http://pycm.baidu.com:8081/mirror/index.html",
-		"http://pycm.baidu.com:8081/page4.html",
-	}
-
-	for _, link := range verifyLinks {
-		if !c.urlTable.Exist(link) {
-			t.Errorf("%s not visited", link)
-		}
-	}
-}
diff --git a/src/mini_spider/mini_spider_test.go b/src/mini_spider/mini_spider_test.go
deleted file mode 100644
index 7545c33..0000000
--- a/src/mini_spider/mini_spider_test.go
+++ /dev/null
@@ -1,28 +0,0 @@
-/* mini_spider_test.go: test for mini_spider.go */
-/*
-modification history
---------------------
-2017/07/21, by Xiongmin LIN, create
-*/
-/*
-DESCRIPTION
-*/
-
-package mini_spider
-
-import (
-	"testing"
-)
-
-import (
-	"mini_spider_config"
-)
-
-func TestNewMiniSpider(t *testing.T) {
-	conf, _ := mini_spider_config.LoadConfig("../mini_spider_config/test_data/spider.conf")
-	seeds, _ := LoadSeedFile(conf.Basic.UrlListFile)
-	_, err := NewMiniSpider(&conf, seeds)
-	if err != nil {
-		t.Errorf("err happen in NewMiniSpider:%s", err.Error())
-	}
-}
diff --git a/src/mini_spider/webpage_parse.go b/src/mini_spider/webpage_parse.go
deleted file mode 100644
index 77a8ea7..0000000
--- a/src/mini_spider/webpage_parse.go
+++ /dev/null
@@ -1,87 +0,0 @@
-/* webpage_parse.go - parse urls from web page */
-/*
-modification history
---------------------
-2017/07/18, by Xiongmin LIN, create
-*/
-/*
-DESCRIPTION
-*/
-package mini_spider
-
-import (
-	"bytes"
-	"fmt"
-	"net/url"
-)
-
-import (
-	"golang.org/x/net/html"
-)
-
-type HtmlLinks struct {
-	links []string
-}
-
-// create new HtmlLinks
-func NewHtmlLinks() *HtmlLinks {
-	hl := new(HtmlLinks)
-	hl.links = make([]string, 0)
-	return hl
-}
-
-/*
-get all href in given html node
-
-Params:
-	- n: html node
-	- refUrl: reference url
-*/
-func (hl *HtmlLinks) getLinks(n *html.Node, refUrl *url.URL) {
-	if n.Type == html.ElementNode && n.Data == "a" {
-		for _, a := range n.Attr {
-			if a.Key == "href" {
-				linkUrl, err := refUrl.Parse(a.Val)
-				if err == nil {
-					hl.links = append(hl.links, linkUrl.String())
-				}
-				break
-			}
-		}
-	}
-
-	for c := n.FirstChild; c != nil; c = c.NextSibling {
-		hl.getLinks(c, refUrl)
-	}
-}
-
-/*
-get url links in given html page
-
-Params:
-	- data: data for html page
-	- urlStr: url string of this html page
-
-Returns:
-	- links: parsed links
-	- error: any failure
-*/
-func parseWebPage(data []byte, urlStr string) ([]string, error) {
-	// parse html
-	doc, err := html.Parse(bytes.NewReader(data))
-	if err != nil {
-		return nil, fmt.Errorf("html.Parse():%s", err.Error())
-	}
-
-	// parse url
-	refUrl, err := url.ParseRequestURI(urlStr)
-	if err != nil {
-		return nil, fmt.Errorf("url.ParseRequestURI(%s):%s", urlStr, err.Error())
-	}
-
-	// get all links
-	hl := NewHtmlLinks()
-	hl.getLinks(doc, refUrl)
-
-	return hl.links, nil
-}
diff --git a/src/mini_spider/webpage_save.go b/src/mini_spider/webpage_save.go
deleted file mode 100644
index a63e16e..0000000
--- a/src/mini_spider/webpage_save.go
+++ /dev/null
@@ -1,84 +0,0 @@
-/* webpage_save.go */
-/*
-modification history
---------------------
-2017/07/21, by Xiongmin LIN, create
-*/
-/*
-DESCRIPTION
-*/
-package mini_spider
-
-import (
-	"fmt"
-	"io/ioutil"
-	"net/url"
-	"os"
-	"path"
-)
-
-const (
-	OutputFileMode = 0644
-)
-
-/*
-generate root directory to save web page
-
-Params:
-	- rootPath: root path for saving file
-Returns:
-	- error: any failure
-*/
-func genRootDir(rootPath string) error {
-	if _, err := os.Stat(rootPath); os.IsNotExist(err) {
-		if os.MkdirAll(rootPath, 0777) != nil {
-			return fmt.Errorf("os.MkdirAll(%s):%s", rootPath, err.Error())
-		}
-	}
-
-	return nil
-}
-
-/*
-generate file path for given url
-
-Params:
-	- url: url to crawl
-	- rootPath: root path for saving file
-
-Returns:
-	- file path
-*/
-func genFilePath(urlStr, rootPath string) string {
-	filePath := url.QueryEscape(urlStr)
-	filePath = path.Join(rootPath, filePath)
-	return filePath
-}
-
-/*
-save web page of given url to file
-
-Params:
-	- rootPath: root path for saving file
-	- url: url to crawl
-	- data: data to save
-Returns:
-	- error: any failure
-*/
-func saveWebPage(rootPath string, url string, data []byte) error {
-	// create root dir, if not exist
-	if err := genRootDir(rootPath); err != nil {
-		return fmt.Errorf("genRootDir(%s):%s", rootPath, err.Error())
-	}
-
-	// generate full file path
-	filePath := genFilePath(url, rootPath)
-
-	// save to file
-	err := ioutil.WriteFile(filePath, data, OutputFileMode)
-	if err != nil {
-		return fmt.Errorf("ioutil.WriteFile(%s):%s", filePath, err.Error())
-	}
-
-	return nil
-}
diff --git a/src/mini_spider/webpage_save_test.go b/src/mini_spider/webpage_save_test.go
deleted file mode 100644
index 13c7255..0000000
--- a/src/mini_spider/webpage_save_test.go
+++ /dev/null
@@ -1,38 +0,0 @@
-/* webpage_save_test.go - test for webpage_save.go */
-/*
-modification history
---------------------
-2017/07/22, by Xiongmin LIN, create
-*/
-/*
-DESCRIPTION
-*/
-package mini_spider
-
-import (
-	"testing"
-)
-
-// test for genFilePath()
-func TestGenFilePath(t *testing.T) {
-	rootPath := "./output"
-	url := "www.baidu.com"
-
-	filePath := genFilePath(url, rootPath)
-	if filePath != "./output/www.baidu.com" {
-		t.Errorf("err in genFilePath(), filePath=%s", filePath)
-	}
-}
-
-// test for saveWebPage()
-func TestSaveWebPage(t *testing.T) {
-	rootPath := "./output"
-	url := "www.baidu.com"
-	data := []byte("this is a test")
-
-	err := saveWebPage(rootPath, url, data)
-	if err != nil {
-		t.Errorf("err in saveWebPage(%s, %s):%s", rootPath, url, err.Error())
-	}
-}
-
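genFilePath, visible in the deleted copy above and kept by the rename, flattens a URL into a single file name with url.QueryEscape, so every crawled page can live in one directory without path collisions. For example:

    package main

    import (
        "fmt"
        "net/url"
        "path"
    )

    func main() {
        // Same approach as genFilePath: the whole URL becomes one path segment.
        u := "http://example.com/a/b.html"
        fmt.Println(path.Join("./webpage", url.QueryEscape(u)))
        // -> webpage/http%3A%2F%2Fexample.com%2Fa%2Fb.html
    }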

diff --git a/src/web_package/webpage_parse_test.go b/src/web_package/webpage_parse_test.go
deleted file mode 100644
index 13a538e..0000000
--- a/src/web_package/webpage_parse_test.go
+++ /dev/null
@@ -1,34 +0,0 @@
-/* webpage_parse_test.go - test for webpage_parse.go */
-/*
-modification history
---------------------
-2017/07/22, by Xiongmin LIN, create
-*/
-/*
-DESCRIPTION
-*/
-package web_package
-
-import (
-	"testing"
-)
-
-// test for parseWebPage()
-func TestParseWebPage(t *testing.T) {
-	s := []byte(`
-<html>
-Links: <a href="test">link1</a> <a href="/test1/test2">link2</a>
-</html>
-`)
-
-	links, err := parseWebPage(s, "http://www.baidu.com/a/b.html")
-	if err != nil {
-		t.Errorf("err in parseWebPage():%s", err.Error())
-		return
-	}
-
-	if len(links) != 2 {
-		t.Errorf("len(links) should be 2, now it's %d", len(links))
-		return
-	}
-
-	if links[0] != "http://www.baidu.com/a/test" || links[1] != "http://www.baidu.com/test1/test2" {
-		t.Errorf("links:%s", links)
-	}
-}
diff --git a/start.sh b/start.sh
old mode 100644
new mode 100755
index 11cc8f4..92e732d
--- a/start.sh
+++ b/start.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 
-cd output/bin
-./mini_spider -c ../../conf/spider.conf -l ../log/ -s -d
+cd output/
+./mini_spider -c ../conf/spider.conf -l ./log/ -s -d
diff --git a/src/mini_spider/test_data/seedfile/url.data b/test/seedfile/url.data
similarity index 100%
rename from src/mini_spider/test_data/seedfile/url.data
rename to test/seedfile/url.data
diff --git a/src/mini_spider_config/test_data/spider.conf b/test/spider.conf
similarity index 80%
rename from src/mini_spider_config/test_data/spider.conf
rename to test/spider.conf
index 169007b..a0979ac 100644
--- a/src/mini_spider_config/test_data/spider.conf
+++ b/test/spider.conf
@@ -1,9 +1,9 @@
 [Basic]
 # path of the seed file
-urlListFile = ../data/url.data
+urlListFile = ../../test/seedfile/url.data
 
 # directory to store crawled pages
-outputDirectory = ../webpage
+outputDirectory = ../../test/webpage
 
 # max crawl depth (seeds are level 0)
 maxDepth = 2
@@ -12,7 +12,7 @@ maxDepth = 2
 crawlInterval = 1
 
 # crawl timeout, in seconds
-crawlTimeout = 1
+crawlTimeout = 3
 
 # URL pattern (regular expression) of target pages to save
 targetUrl = .*.(htm|html)$
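One caveat about the targetUrl value above: the first dot in .*.(htm|html)$ is unescaped, so it matches any character and the pattern also accepts URLs such as http://a/xhtml. If only literal .htm/.html suffixes are intended, the dot needs escaping; a comparison sketch:

    package main

    import (
        "fmt"
        "regexp"
    )

    func main() {
        loose := regexp.MustCompile(`.*.(htm|html)$`)   // "." matches any character
        strict := regexp.MustCompile(`.*\.(htm|html)$`) // "\." matches a literal dot

        for _, u := range []string{"http://a/b.html", "http://a/xhtml"} {
            fmt.Println(u, loose.MatchString(u), strict.MatchString(u))
        }
        // http://a/b.html true true
        // http://a/xhtml true false
    }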