forked from XiongminLin/mini-spider-golang
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
linxiongmin
committed
Apr 18, 2018
1 parent
6d4df1e
commit e7d8836
Showing
31 changed files
with
1,543 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
[背景]: | ||
在调研过程中,经常需要对一些网站进行定向抓取。由于go包含各种强大的库,使用go做定向抓取比较简单。请使用go开发一个迷你定向抓取器mini_spider,实现对种子链接的抓取,并把URL长相符合特定pattern的网页保存到磁盘上。 | ||
|
||
[程序运行]: | ||
./mini_spider -c ../conf/spider.conf -l ../log | ||
|
||
[配置文件spider.conf]: | ||
[Basic] | ||
# 种子文件路径 | ||
urlListFile = ../data/url.data | ||
# 抓取结果存储目录 | ||
outputDirectory = ../output | ||
# 最大抓取深度(种子为0级) | ||
maxDepth = 1 | ||
# 抓取间隔. 单位: 秒 | ||
crawlInterval = 1 | ||
# 抓取超时. 单位: 秒 | ||
crawlTimeout = 1 | ||
# 需要存储的目标网页URL pattern(正则表达式) | ||
targetUrl = .*.(htm|html)$ | ||
# 抓取routine数 | ||
threadCount = 8 | ||
|
||
[种子文件为json格式,示例如下]: | ||
[ | ||
"http://www.baidu.com", | ||
"http://www.sina.com.cn", | ||
... | ||
] | ||
|
||
[要求和注意事项]: | ||
1. 需要支持命令行参数处理。具体包含: -h(帮助)、-v(版本)、-c(配置文件路径)、-l(日志文件路径,2个日志:mini_spider.log和mini_spider.wf.log) | ||
2. 抓取网页的顺序没有限制 | ||
3. 单个网页抓取或解析失败,不能导致整个程序退出。需要在日志中记录下错误原因并继续。 | ||
4. 当程序完成所有抓取任务后,必须优雅退出 | ||
5. 从HTML提取链接时需要处理相对路径和绝对路径 | ||
6. 需要能够处理不同字符编码的网页(例如utf-8或gbk) | ||
7. 网页存储时每个网页单独存为一个文件,以URL为文件名。注意对URL中的特殊字符,需要做转义 | ||
8. 要求支持多routine并行抓取(注意:这里并不是指简单设置GOMAXPROCS>1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#!/usr/bin/env bash
#
# build.sh - run the unit tests of the mini_spider package, build the
# mini_spider binary and assemble the deployable tree under ./output.
#
# Run it from the project root (the directory that contains src/, conf/
# and data/); every path below is resolved relative to that directory.

### dir structure
# /bfe
# /bfe-common
# /go
# /output
# /golang-lib
# /mini_spider
# build.sh

### remember the project root so every later step can use absolute paths
WORKROOT=$(pwd)

# prepare GOPATH: the project root, with the shared golang-lib in front of it
export GOPATH="${WORKROOT}"
export GOPATH="${WORKROOT}/../bfe-common/golang-lib:${GOPATH}"

### unit tests
cd "${WORKROOT}/src/mini_spider" || exit 1

# compile the test binary first so compile errors are reported separately
if ! go test -c -o ./testRun; then
    echo "go compile test failed"
    exit 1
fi

# Run the compiled test binary. "go test -run testRun" would treat "testRun"
# as a case-sensitive regexp over test names, match no TestXxx function and
# succeed without running anything.
# "go test -c" writes no binary when the package has no test files, so only
# run it when it exists.
if [ -x ./testRun ]; then
    if ! ./testRun; then
        rm -f ./testRun
        echo "go run test failed"
        exit 1
    fi
    rm -f ./testRun
fi
echo "OK for go test"

### build
cd "${WORKROOT}/src/main" || exit 1
if ! go build -o mini_spider; then
    echo "fail to go build mini_spider.go"
    exit 1
fi
echo "OK for go build mini_spider.go"

### create directory for output
# Return to the project root explicitly: the rm -rf below must never run in
# an unexpected directory.
cd "${WORKROOT}" || exit 1
rm -rf output
mkdir output || exit 1

# copy config
mkdir output/conf || exit 1
cp conf/spider.conf output/conf || exit 1

# copy data
cp -r data output/ || exit 1

# copy file to bin
mkdir output/bin || exit 1
mv src/main/mini_spider output/bin || exit 1

# create dir for log
mkdir output/log || exit 1

# change mode of files in /bin
chmod +x output/bin/mini_spider || exit 1

echo "OK for build mini_spider"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
[Basic] | ||
# 种子文件路径 | ||
urlListFile = ../data/url.data | ||
|
||
# 抓取结果存储目录 | ||
outputDirectory = ../webpage | ||
|
||
# 最大抓取深度(种子为0级) | ||
maxDepth = 2 | ||
|
||
# 抓取间隔. 单位: 秒 | ||
crawlInterval = 1 | ||
|
||
# 抓取超时. 单位: 秒 | ||
crawlTimeout = 1 | ||
|
||
# 需要存储的目标网页URL pattern(正则表达式) | ||
targetUrl = .*.(htm|html)$ | ||
|
||
# 抓取routine数 | ||
threadCount = 8 | ||
|
||
# 优雅退出时间. 单位: 秒,最高60秒 | ||
GracefulShutdownTimeout = 5 | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[ | ||
"http://www.sina.com.cn/" | ||
] |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
/* mini_spider.go - program entry point */ | ||
/* | ||
modification history | ||
-------------------- | ||
2017/07/20, by Xiongmin LIN, create | ||
*/ | ||
/* | ||
DESCRIPTION | ||
mini spider | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"flag" | ||
"fmt" | ||
"os" | ||
"os/signal" | ||
"syscall" | ||
"time" | ||
) | ||
|
||
|
||
import ( | ||
"code.google.com/p/log4go" | ||
"www.baidu.com/golang-lib/log" | ||
) | ||
|
||
import ( | ||
"mini_spider_config" | ||
"mini_spider" | ||
) | ||
|
||
// Command-line flags of mini_spider. Each variable points at the parsed
// value once flag.Parse has run.
var (
	// -c: path handed to the configuration loader.
	confPath = flag.String("c", "../conf/spider.conf", "mini_spider configure path")
	// -h: print the flag defaults and quit.
	help = flag.Bool("h", false, "show help")
	// -l: directory the log files are written to.
	logPath = flag.String("l", "../log", "dir path of log")
	// -v: print the version and quit.
	showVer = flag.Bool("v", false, "show version")
	// -s: also show the log on stdout.
	stdOut = flag.Bool("s", false, "to show log in stdout")
	// -d: log at DEBUG level instead of INFO.
	debugLog = flag.Bool("d", false, "to show debug log (otherwise >= info)")
)
|
||
func Exit(code int) { | ||
log.Logger.Close() | ||
/* to overcome bug in log, sleep for a while */ | ||
time.Sleep(1 * time.Second) | ||
os.Exit(code) | ||
} | ||
|
||
/* the main function */ | ||
func main() { | ||
var logSwitch string | ||
|
||
flag.Parse() | ||
if *help { | ||
flag.PrintDefaults() | ||
return | ||
} | ||
if *showVer { | ||
fmt.Printf("version is: 1.0.0\n") | ||
return | ||
} | ||
|
||
// debug switch | ||
if *debugLog { | ||
logSwitch = "DEBUG" | ||
} else { | ||
logSwitch = "INFO" | ||
} | ||
fmt.Printf("mini_spider starts...\n") | ||
|
||
/* initialize log */ | ||
/* set log buffer size */ | ||
log4go.SetLogBufferLength(10000) | ||
/* if blocking, log will be dropped */ | ||
log4go.SetLogWithBlocking(false) | ||
/* we want to get state of log4go */ | ||
log4go.SetWithModuleState(true) | ||
|
||
err := log.Init("mini_spider", logSwitch, *logPath, *stdOut, "midnight", 5) | ||
if err != nil { | ||
fmt.Printf("main(): err in log.Init():%s\n", err.Error()) | ||
os.Exit(-1) | ||
} | ||
|
||
// load config | ||
config, err := mini_spider_config.LoadConfig(*confPath) | ||
if err != nil { | ||
log.Logger.Error("main():err in ConfigLoad():%s", err.Error()) | ||
Exit(-1) | ||
} | ||
|
||
// load seeds | ||
seeds, err := mini_spider.LoadSeedFile(config.Basic.UrlListFile) | ||
if err != nil { | ||
log.Logger.Error("main():err in loadSeedFile(%s):%s", config.Basic.UrlListFile, err.Error()) | ||
Exit(1) | ||
} | ||
|
||
// create mini-spider | ||
miniSpider, err:= mini_spider.NewMiniSpider(&config, seeds) | ||
if err != nil { | ||
log.Logger.Error("main():err in NewMiniSpider():%s", err.Error()) | ||
Exit(1) | ||
} | ||
|
||
// run mini-spider | ||
miniSpider.Run() | ||
|
||
// waiting for all tasks to finish. | ||
go func() { | ||
for { | ||
if miniSpider.GetUnfinished() == 0 { | ||
log.Logger.Info("All task finished, quit") | ||
Exit(0) | ||
} | ||
|
||
log.Logger.Debug("Waiting for %d tasks to finish\n", miniSpider.GetUnfinished()) | ||
|
||
// sleep for a while | ||
time.Sleep(5 * time.Second) | ||
} | ||
} () | ||
|
||
// Handle SIGINT and SIGTERM. | ||
ch := make(chan os.Signal) | ||
signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM) | ||
<-ch | ||
|
||
// ensure that all logs are export and normal exit | ||
Exit(0) | ||
|
||
} |
Oops, something went wrong.