Skip to content

Commit

Permalink
Code refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
jacksontong committed May 23, 2022
1 parent e7d8836 commit 9688c5c
Show file tree
Hide file tree
Showing 37 changed files with 413 additions and 526 deletions.
41 changes: 41 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,42 @@
# OSX leaves these everywhere on SMB shares
._*

# OSX trash
.DS_Store

# Eclipse files
.classpath
.project
.settings/**

# Files generated by JetBrains IDEs, e.g. IntelliJ IDEA
.idea/
*.iml
.bin/

# VS Code files
.vscode

# This is where the result of the go build goes
/output*/
/_output*/
/_output

# Emacs save files
*~
\#*\#
.\#*

# Vim-related files
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist

# Generated by the Makefile
bin
node_modules
package.json
yarn.lock

48 changes: 48 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# init project path
HOMEDIR := $(shell pwd)
OUTDIR  := $(HOMEDIR)/output
BIN     := mini_spider

# Build metadata injected into the binary via -ldflags (see LDFLAGS below).
# The backticks are intentionally left unexpanded by make: the shell evaluates
# them when the build recipe runs.
VERSION=`git describe --always`
BUILDTIME=`date +%FT%T%z`
GOVERSION=`go version`

# init command params
GO      := go
GOOS    := linux
GOBUILD := $(GO) build
GOTEST  := $(GO) test
# $$ defers the command substitution to the shell that runs the test recipe.
GOPKGS  := $$($(GO) list ./pkg... | grep -vE "vendor")

# Setup the -ldflags option for go build here, interpolate the variable values.
# BuildTime and GoVersion are single-quoted so values containing spaces reach
# the linker as one -X argument each.
LDFLAGS=-ldflags "-w -s -X main.Version=${VERSION} -X 'main.BuildTime=${BUILDTIME}' -X 'main.GoVersion=${GOVERSION}'"

# make, make all
all: prepare compile package

# make prepare (placeholder: nothing to prepare at the moment)
prepare:

# make compile, go build
compile: build
build:
	GOOS=$(GOOS) $(GOBUILD) ${LDFLAGS} -o $(HOMEDIR)/$(BIN) cmd/mini_spider.go

# make test, test your code
test: test-case
test-case:
	$(GOTEST) -v -cover $(GOPKGS)
	# -f: do not fail the target when the tests left no test/webpage behind
	rm -rf test/webpage

# make package: move the built binary into the output directory
package: package-bin
package-bin:
	mkdir -p $(OUTDIR)
	mv $(HOMEDIR)/$(BIN) $(OUTDIR)/

# make clean
clean:
	rm -rf $(OUTDIR)

# avoid filename conflict and speed up build; every non-file target is listed,
# including the helper targets that the aliases above depend on
.PHONY: all prepare compile build test test-case package package-bin clean
2 changes: 1 addition & 1 deletion README
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
在调研过程中,经常需要对一些网站进行定向抓取。由于go包含各种强大的库,使用go做定向抓取比较简单。请使用go开发一个迷你定向抓取器mini_spider,实现对种子链接的抓取,并把URL长相符合特定pattern的网页保存到磁盘上。

[程序运行]:
./mini_spider -c ../conf -l ../log
./mini_spider -c ../conf/spider.conf -l ./log

[配置文件spider.conf]:
[spider]
Expand Down
78 changes: 0 additions & 78 deletions build.sh

This file was deleted.

47 changes: 18 additions & 29 deletions src/main/mini_spider.go → cmd/mini_spider.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,25 @@ import (
"os/signal"
"syscall"
"time"
)


import (
"code.google.com/p/log4go"
"www.baidu.com/golang-lib/log"
)
"github.com/baidu/go-lib/log"

import (
"mini_spider_config"
"mini_spider"
mini_spider_config "github.com/cumirror/mini-spider/pkg/config"
"github.com/cumirror/mini-spider/pkg/seed"
mini_spider "github.com/cumirror/mini-spider/pkg/spider"
)

var (
confPath *string = flag.String("c", "../conf/spider.conf", "mini_spider configure path")
help *bool = flag.Bool("h", false, "show help")
logPath *string = flag.String("l", "../log", "dir path of log")
showVer *bool = flag.Bool("v", false, "show version")
stdOut *bool = flag.Bool("s", false, "to show log in stdout")
debugLog *bool = flag.Bool("d", false, "to show debug log (otherwise >= info)")
confPath *string = flag.String("c", "../conf/spider.conf", "mini_spider configure path")
help *bool = flag.Bool("h", false, "show help")
logPath *string = flag.String("l", "../log", "dir path of log")
showVer *bool = flag.Bool("v", false, "show version")
stdOut *bool = flag.Bool("s", false, "to show log in stdout")
debugLog *bool = flag.Bool("d", false, "to show debug log (otherwise >= info)")
)

var Version, BuildTime, GoVersion string

func Exit(code int) {
log.Logger.Close()
/* to overcome bug in log, sleep for a while */
Expand All @@ -57,7 +54,7 @@ func main() {
return
}
if *showVer {
fmt.Printf("version is: 1.0.0\n")
fmt.Println("Version:", Version, "Build:", BuildTime, "Go:", GoVersion)
return
}

Expand All @@ -69,14 +66,6 @@ func main() {
}
fmt.Printf("mini_spider starts...\n")

/* initialize log */
/* set log buffer size */
log4go.SetLogBufferLength(10000)
/* if blocking, log will be dropped */
log4go.SetLogWithBlocking(false)
/* we want to get state of log4go */
log4go.SetWithModuleState(true)

err := log.Init("mini_spider", logSwitch, *logPath, *stdOut, "midnight", 5)
if err != nil {
fmt.Printf("main(): err in log.Init():%s\n", err.Error())
Expand All @@ -91,14 +80,14 @@ func main() {
}

// load seeds
seeds, err := mini_spider.LoadSeedFile(config.Basic.UrlListFile)
seeds, err := seed.LoadSeedFile(config.Basic.UrlListFile)
if err != nil {
log.Logger.Error("main():err in loadSeedFile(%s):%s", config.Basic.UrlListFile, err.Error())
Exit(1)
}

// create mini-spider
miniSpider, err:= mini_spider.NewMiniSpider(&config, seeds)
miniSpider, err := mini_spider.NewMiniSpider(&config, seeds)
if err != nil {
log.Logger.Error("main():err in NewMiniSpider():%s", err.Error())
Exit(1)
Expand All @@ -107,7 +96,7 @@ func main() {
// run mini-spider
miniSpider.Run()

// waiting for all tasks to finish.
// waiting for all tasks to finish.
go func() {
for {
if miniSpider.GetUnfinished() == 0 {
Expand All @@ -120,8 +109,8 @@ func main() {
// sleep for a while
time.Sleep(5 * time.Second)
}
} ()
}()

// Handle SIGINT and SIGTERM.
ch := make(chan os.Signal)
signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)
Expand Down
4 changes: 2 additions & 2 deletions conf/spider.conf
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[Basic]
# 种子文件路径
urlListFile = ../data/url.data
urlListFile = ../conf/url.data

# 抓取结果存储目录
outputDirectory = ../webpage
outputDirectory = ./webpage

# 最大抓取深度(种子为0级)
maxDepth = 2
Expand Down
File renamed without changes.
14 changes: 14 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module github.com/cumirror/mini-spider

go 1.18

require (
github.com/baidu/go-lib v0.0.0-20210902034828-42829d4bdecd
golang.org/x/net v0.0.0-20220520000938-2e3eb7b945c2
gopkg.in/gcfg.v1 v1.2.3
)

require (
github.com/jehiah/go-strftime v0.0.0-20171201141054-1d33003b3869 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
)
10 changes: 10 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
github.com/baidu/go-lib v0.0.0-20210902034828-42829d4bdecd h1:AQ/hNi9wwj7mm06Zwycxp++X3cKKMAz8mxjNYSIaj8U=
github.com/baidu/go-lib v0.0.0-20210902034828-42829d4bdecd/go.mod h1:FneHDqz3wLeDGdWfRyW4CzBbCwaqesLGIFb09N80/ww=
github.com/jehiah/go-strftime v0.0.0-20171201141054-1d33003b3869 h1:IPJ3dvxmJ4uczJe5YQdrYB16oTJlGSC/OyZDqUk9xX4=
github.com/jehiah/go-strftime v0.0.0-20171201141054-1d33003b3869/go.mod h1:cJ6Cj7dQo+O6GJNiMx+Pa94qKj+TG8ONdKHgMNIyyag=
golang.org/x/net v0.0.0-20220520000938-2e3eb7b945c2 h1:NWy5+hlRbC7HK+PmcXVUmW1IMyFce7to56IUvhUFm7Y=
golang.org/x/net v0.0.0-20220520000938-2e3eb7b945c2/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
gopkg.in/gcfg.v1 v1.2.3 h1:m8OOJ4ccYHnx2f4gQwpno8nAX5OGOh7RLaaz0pj3Ogs=
gopkg.in/gcfg.v1 v1.2.3/go.mod h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o=
gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME=
gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI=
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ modification history
DESCRIPTION
*/

package mini_spider_config
package config

import (
"fmt"
Expand Down
6 changes: 2 additions & 4 deletions src/mini_spider_config/config.go → pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,12 @@ modification history
DESCRIPTION
*/

package mini_spider_config
package config

import (
"fmt"
)

import (
"code.google.com/p/gcfg"
"gopkg.in/gcfg.v1"
)

type MiniSpiderConf struct {
Expand Down
Loading

0 comments on commit 9688c5c

Please sign in to comment.