Skip to content

Commit

Permalink
golang: mini-spider
Browse files Browse the repository at this point in the history
  • Loading branch information
linxiongmin committed Apr 18, 2018
1 parent 6d4df1e commit e7d8836
Show file tree
Hide file tree
Showing 31 changed files with 1,543 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.DS_Store
39 changes: 39 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
[背景]:
在调研过程中,经常需要对一些网站进行定向抓取。由于go包含各种强大的库,使用go做定向抓取比较简单。请使用go开发一个迷你定向抓取器mini_spider,实现对种子链接的抓取,并把URL长相符合特定pattern的网页保存到磁盘上。

[程序运行]:
./mini_spider -c ../conf -l ../log

[配置文件spider.conf]:
[Basic]
# 种子文件路径
urlListFile = ../data/url.data
# 抓取结果存储目录
outputDirectory = ../output
# 最大抓取深度(种子为0级)
maxDepth = 1
# 抓取间隔. 单位: 秒
crawlInterval = 1
# 抓取超时. 单位: 秒
crawlTimeout = 1
# 需要存储的目标网页URL pattern(正则表达式)
targetUrl = .*.(htm|html)$
# 抓取routine数
threadCount = 8

[种子文件为json格式,示例如下]:
[
"http://www.baidu.com",
"http://www.sina.com.cn",
...
]

[要求和注意事项]:
1. 需要支持命令行参数处理。具体包含: -h(帮助)、-v(版本)、-c(配置文件路径)、-l(日志文件路径,2个日志:mini_spider.log和mini_spider.wf.log)
2. 抓取网页的顺序没有限制
3. 单个网页抓取或解析失败,不能导致整个程序退出。需要在日志中记录下错误原因并继续。
4. 当程序完成所有抓取任务后,必须优雅退出
5. 从HTML提取链接时需要处理相对路径和绝对路径
6. 需要能够处理不同字符编码的网页(例如utf-8或gbk)
7. 网页存储时每个网页单独存为一个文件,以URL为文件名。注意对URL中的特殊字符,需要做转义
8. 要求支持多routine并行抓取(注意:这里并不是指简单设置GOMAXPROCS>1)
1 change: 0 additions & 1 deletion README.md

This file was deleted.

78 changes: 78 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
#
# build.sh - run the unit tests, build mini_spider and assemble ./output.
#
# Must be started from the project root, i.e. the directory that contains
# src/, conf/ and data/. The shared golang-lib is expected at
# ../bfe-common/golang-lib relative to that directory.

### dir structure
# /bfe
# /bfe-common
# /go
# /output
# /golang-lib
# /mini_spider
# build.sh

# die prints a message and aborts the build with a non-zero status.
die() {
    echo "$1"
    exit 1
}

### remember the project root; every later path is anchored to it
WORKROOT=$(pwd)

# prepare GOPATH: shared golang-lib first, then the project itself
export GOPATH="${WORKROOT}/../bfe-common/golang-lib:${WORKROOT}"

### test
cd "${WORKROOT}/src/mini_spider" || die "fail to enter ${WORKROOT}/src/mini_spider"

# compile the test binary first so compile errors are reported separately
if ! go test -c -o ./testRun; then
    echo "go compile test failed"
    exit 1
fi

# Run the package tests. The previous "go test -run testRun" used the binary
# name as a test-name regexp, which matches no Test* function and therefore
# ran nothing.
if ! go test; then
    rm -f ./testRun
    echo "go run test failed"
    exit 1
fi
rm -f ./testRun
echo "OK for go test"

### build
cd "${WORKROOT}/src/main" || die "fail to enter ${WORKROOT}/src/main"
if ! go build -o mini_spider; then
    echo "fail to go build mini_spider.go"
    exit 1
fi
echo "OK for go build mini_spider.go"

### create directory for output
# Return to the project root explicitly so that "rm -rf output" can never run
# in an unexpected directory.
cd "${WORKROOT}" || die "fail to enter ${WORKROOT}"
if [ -d "./output" ]
then
    rm -rf ./output
fi
mkdir output || die "fail to create output"

# copy config
mkdir output/conf || die "fail to create output/conf"
cp conf/spider.conf output/conf || die "fail to copy conf/spider.conf"

# copy data
cp -r data output/ || die "fail to copy data"

# copy file to bin
mkdir output/bin || die "fail to create output/bin"
mv src/main/mini_spider output/bin || die "fail to move mini_spider to output/bin"

# create dir for log
mkdir output/log || die "fail to create output/log"

# change mode of files in /bin
chmod +x output/bin/mini_spider || die "fail to chmod output/bin/mini_spider"


echo "OK for build mini_spider"
27 changes: 27 additions & 0 deletions conf/spider.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[Basic]
# Path of the seed file
urlListFile = ../data/url.data

# Directory where crawled pages are stored
outputDirectory = ../webpage

# Maximum crawl depth (seeds are level 0)
maxDepth = 2

# Crawl interval, in seconds
crawlInterval = 1

# Crawl timeout, in seconds
crawlTimeout = 1

# URL pattern (regular expression) of the target pages to store
# NOTE(review): the '.' before the group is unescaped, so it matches any
# character; '\.' may have been intended - confirm before changing.
targetUrl = .*.(htm|html)$

# Number of crawl routines
threadCount = 8

# Graceful shutdown timeout, in seconds (maximum 60)
GracefulShutdownTimeout = 5



3 changes: 3 additions & 0 deletions data/url.data
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[
"http://www.sina.com.cn/"
]
Binary file added src/.DS_Store
Binary file not shown.
133 changes: 133 additions & 0 deletions src/main/mini_spider.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/* mini_spider.go - program entry point */
/*
modification history
--------------------
2017/07/20, by Xiongmin LIN, create
*/
/*
DESCRIPTION
mini spider
*/

package main

import (
"flag"
"fmt"
"os"
"os/signal"
"syscall"
"time"
)


import (
"code.google.com/p/log4go"
"www.baidu.com/golang-lib/log"
)

import (
"mini_spider_config"
"mini_spider"
)

// Command-line flags. Each variable is the pointer returned by the flag
// package and holds the parsed value once flag.Parse has run.
var (
	// confPath is the value of -c.
	confPath = flag.String("c", "../conf/spider.conf", "mini_spider configure path")
	// help is set by -h.
	help = flag.Bool("h", false, "show help")
	// logPath is the value of -l.
	logPath = flag.String("l", "../log", "dir path of log")
	// showVer is set by -v.
	showVer = flag.Bool("v", false, "show version")
	// stdOut is set by -s.
	stdOut = flag.Bool("s", false, "to show log in stdout")
	// debugLog is set by -d.
	debugLog = flag.Bool("d", false, "to show debug log (otherwise >= info)")
)

// Exit closes the logger, pauses for one second and then terminates the
// process with the given status code.
func Exit(code int) {
	// Per the original author's note, the pause works around a bug in the
	// log package.
	const closeDelay = time.Second

	log.Logger.Close()
	time.Sleep(closeDelay)
	os.Exit(code)
}

// main is the program entry point. It parses the command-line flags,
// initializes logging, loads the configuration and the seed list, starts the
// spider, and then blocks until SIGINT or SIGTERM arrives. A background
// goroutine polls the spider and exits the process once it reports no
// unfinished tasks.
func main() {
	flag.Parse()
	if *help {
		flag.PrintDefaults()
		return
	}
	if *showVer {
		fmt.Printf("version is: 1.0.0\n")
		return
	}

	// Log level: DEBUG when -d is set, INFO otherwise.
	logSwitch := "INFO"
	if *debugLog {
		logSwitch = "DEBUG"
	}
	fmt.Printf("mini_spider starts...\n")

	// Configure log4go before the logger is initialized.
	// Set the log buffer size.
	log4go.SetLogBufferLength(10000)
	// Use non-blocking mode.
	// NOTE(review): presumably records are dropped rather than blocking the
	// caller when the buffer is full - confirm against the log4go fork in use.
	log4go.SetLogWithBlocking(false)
	// We want to get the state of log4go.
	log4go.SetWithModuleState(true)

	// NOTE(review): "midnight" and 5 look like the rotation schedule and the
	// number of kept files - confirm against golang-lib/log.
	err := log.Init("mini_spider", logSwitch, *logPath, *stdOut, "midnight", 5)
	if err != nil {
		// The logger is not available here, so report on stdout and exit
		// without going through Exit.
		fmt.Printf("main(): err in log.Init():%s\n", err.Error())
		os.Exit(-1)
	}

	// Load config.
	config, err := mini_spider_config.LoadConfig(*confPath)
	if err != nil {
		log.Logger.Error("main():err in ConfigLoad():%s", err.Error())
		Exit(-1)
	}

	// Load seeds.
	seeds, err := mini_spider.LoadSeedFile(config.Basic.UrlListFile)
	if err != nil {
		log.Logger.Error("main():err in loadSeedFile(%s):%s", config.Basic.UrlListFile, err.Error())
		Exit(1)
	}

	// Create mini-spider.
	miniSpider, err := mini_spider.NewMiniSpider(&config, seeds)
	if err != nil {
		log.Logger.Error("main():err in NewMiniSpider():%s", err.Error())
		Exit(1)
	}

	// Run mini-spider.
	miniSpider.Run()

	// Wait for all tasks to finish: poll every 5 seconds and exit the whole
	// process as soon as nothing is left.
	go func() {
		for {
			// Read the counter once so the logged value is the one checked.
			unfinished := miniSpider.GetUnfinished()
			if unfinished == 0 {
				log.Logger.Info("All task finished, quit")
				Exit(0)
			}

			log.Logger.Debug("Waiting for %d tasks to finish\n", unfinished)

			// Sleep for a while before polling again.
			time.Sleep(5 * time.Second)
		}
	}()

	// Handle SIGINT and SIGTERM. signal.Notify never blocks when sending, so
	// the channel needs a buffer or a signal can be missed.
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)
	<-ch

	// Make sure all logs are flushed, then exit normally.
	Exit(0)
}
Loading

0 comments on commit e7d8836

Please sign in to comment.