Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into ytdlp
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed Sep 8, 2024
2 parents 578fb6b + 71f8b3f commit eea0270
Show file tree
Hide file tree
Showing 29 changed files with 819 additions and 441 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ jobs:
name: release linux/amd64
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: wangyoucao577/go-release-action@af4a9db7b0ee3cf602cb75541d72bf568a99da4f #release/rel-v1.42
- uses: actions/checkout@v4
- uses: wangyoucao577/go-release-action@6ac7dba1f9e61850053324549cb6bc88e4b473d2 #release/rel-v1.51
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
goos: linux
goarch: amd64
goversion: 1.21.6
goversion: 1.23.0
compress_assets: OFF
md5sum: FALSE
sha256sum: TRUE
asset_name: 'Zeno-linux-amd64'
asset_name: 'Zeno-linux-amd64'
6 changes: 3 additions & 3 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Go
uses: actions/setup-go@v2
uses: actions/setup-go@v5
with:
go-version: '1.22.2'
go-version: '1.23'

- name: Build
run: go build -v ./...
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ Zeno
*.sh
zeno.log
.vscode/
*.py
*.py
.DS_Store
7 changes: 5 additions & 2 deletions cmd/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().String("cookies", "", "File containing cookies that will be used for requests.")
getCmd.PersistentFlags().Bool("keep-cookies", false, "Keep a global cookie jar")
getCmd.PersistentFlags().Bool("headless", false, "Use headless browsers instead of standard GET requests.")
getCmd.PersistentFlags().Bool("local-seencheck", false, "Simple local seencheck to avoid re-crawling of URIs.")
getCmd.PersistentFlags().Bool("disable-seencheck", false, "Disable the (remote or local) seencheck that avoid re-crawling of URIs.")
getCmd.PersistentFlags().Bool("json", false, "Output logs in JSON")
getCmd.PersistentFlags().Bool("debug", false, "")
getCmd.PersistentFlags().Bool("api", false, "Enable API")
Expand All @@ -56,7 +56,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().StringSlice("exclude-string", []string{}, "Discard any (discovered) URLs containing this string.")
getCmd.PersistentFlags().Bool("random-local-ip", false, "Use random local IP for requests. (will be ignored if a proxy is set)")
getCmd.PersistentFlags().Int("min-space-required", 20, "Minimum space required in GB to continue the crawl.")
getCmd.PersistentFlags().Bool("no-handover", false, "Disable the handover mechanism that dispatch URLs via a buffer before enqueuing on disk.")
getCmd.PersistentFlags().Bool("handover", false, "Use the handover mechanism that dispatch URLs via a buffer before enqueuing on disk. (UNSTABLE)")
getCmd.PersistentFlags().Bool("ultrasafe-queue", false, "Don't use committed batch writes to the WAL and instead fsync() after each write.")

// Proxy flags
Expand Down Expand Up @@ -88,6 +88,9 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().Bool("no-ytdlp", false, "Disable youtube-dlp usage for video extraction.")
getCmd.PersistentFlags().String("ytdlp-path", "", "Path to youtube-dlp binary.")

// Pyroscope profiling flag
getCmd.PersistentFlags().String("pyroscope-address", "", "Pyroscope server address for profiling. Will enable the Pyroscope profiling suite when added.")

// Alias support
// As cobra doesn't support aliases natively (couldn't find a way to do it), we have to do it manually
// This is a workaround to allow users to use `--hops` instead of `--max-hops` for example
Expand Down
5 changes: 3 additions & 2 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type Config struct {
ElasticSearchUsername string `mapstructure:"es-user"`
ElasticSearchPassword string `mapstructure:"es-password"`
ElasticSearchIndexPrefix string `mapstructure:"es-index-prefix"`
PyroscopeAddress string `mapstructure:"pyroscope-address"`
DisableHTMLTag []string `mapstructure:"disable-html-tag"`
ExcludeHosts []string `mapstructure:"exclude-host"`
IncludeHosts []string `mapstructure:"include-host"`
Expand All @@ -57,7 +58,7 @@ type Config struct {
HQBatchSize int64 `mapstructure:"hq-batch-size"`
KeepCookies bool `mapstructure:"keep-cookies"`
Headless bool `mapstructure:"headless"`
LocalSeencheck bool `mapstructure:"local-seencheck"`
DisableSeencheck bool `mapstructure:"disable-seencheck"`
JSON bool `mapstructure:"json"`
Debug bool `mapstructure:"debug"`
LiveStats bool `mapstructure:"live-stats"`
Expand All @@ -74,8 +75,8 @@ type Config struct {
HQContinuousPull bool `mapstructure:"hq-continuous-pull"`
HQRateLimitSendBack bool `mapstructure:"hq-rate-limiting-send-back"`
NoStdoutLogging bool `mapstructure:"no-stdout-log"`
NoHandover bool `mapstructure:"no-handover"`
NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"`
Handover bool `mapstructure:"handover"`

// Dependencies
NoYTDLP bool `mapstructure:"no-ytdlp"`
Expand Down
50 changes: 25 additions & 25 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
module github.com/internetarchive/Zeno

go 1.22.4
go 1.23.1

require (
git.archive.org/wb/gocrawlhq v1.2.5
github.com/CorentinB/warc v0.8.40
git.archive.org/wb/gocrawlhq v1.2.7
github.com/CorentinB/warc v0.8.44
github.com/PuerkitoBio/goquery v1.9.2
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
github.com/clbanning/mxj/v2 v2.7.0
Expand All @@ -13,19 +13,20 @@ require (
github.com/google/uuid v1.6.0
github.com/gosuri/uilive v0.0.4
github.com/gosuri/uitable v0.0.4
github.com/grafana/pyroscope-go v1.1.2
github.com/paulbellamy/ratecounter v0.2.0
github.com/philippgille/gokv/leveldb v0.7.0
github.com/prometheus/client_golang v1.19.1
github.com/prometheus/client_golang v1.20.2
github.com/remeh/sizedwaitgroup v1.0.0
github.com/sirupsen/logrus v1.9.3
github.com/spf13/afero v1.11.0
github.com/spf13/cobra v1.8.0
github.com/spf13/cobra v1.8.1
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.19.0
github.com/stretchr/testify v1.9.0
github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5
go.uber.org/goleak v1.3.0
golang.org/x/net v0.27.0
golang.org/x/net v0.28.0
google.golang.org/protobuf v1.34.2
mvdan.cc/xurls/v2 v2.5.0
)
Expand All @@ -35,7 +36,7 @@ require (
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cloudflare/circl v1.3.9 // indirect
github.com/cloudflare/circl v1.4.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect
github.com/fatih/color v1.17.0 // indirect
Expand All @@ -47,46 +48,45 @@ require (
github.com/gobwas/ws v1.4.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/gomodule/redigo v1.9.2 // indirect
github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/klauspost/pgzip v1.2.6 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/mattn/go-runewidth v0.0.16 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/onsi/gomega v1.27.6 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/philippgille/gokv/encoding v0.7.0 // indirect
github.com/philippgille/gokv/util v0.7.0 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/common v0.57.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/refraction-networking/utls v1.6.6 // indirect
github.com/refraction-networking/utls v1.6.7 // indirect
github.com/rivo/uniseg v0.4.7 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/locafero v0.6.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
github.com/satori/go.uuid v1.2.0 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spf13/cast v1.6.0 // indirect
github.com/spf13/cast v1.7.0 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/syndtr/goleveldb v1.0.0 // indirect
go.opentelemetry.io/otel v1.28.0 // indirect
go.opentelemetry.io/otel/metric v1.28.0 // indirect
go.opentelemetry.io/otel/trace v1.28.0 // indirect
go.uber.org/atomic v1.9.0 // indirect
go.uber.org/multierr v1.9.0 // indirect
golang.org/x/crypto v0.25.0 // indirect
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/text v0.16.0 // indirect
github.com/ulikunitz/xz v0.5.12 // indirect
go.opentelemetry.io/otel v1.29.0 // indirect
go.opentelemetry.io/otel/metric v1.29.0 // indirect
go.opentelemetry.io/otel/trace v1.29.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.26.0 // indirect
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.25.0 // indirect
golang.org/x/text v0.17.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
Loading

0 comments on commit eea0270

Please sign in to comment.