Skip to content

Commit

Permalink
feat: enhance Wikipedia image provider with rate limiting and retry logic
Browse files Browse the repository at this point in the history

- Added rate limiting to the wikiMediaProvider to control API request frequency, allowing up to 10 requests per second with a burst capacity of 10.
- Implemented a retry mechanism for API queries, allowing up to 3 attempts with exponential backoff on failure.
- Updated logging to include request IDs for better traceability during debugging.
- Refactored query methods to incorporate the new retry logic and improved debug logging for API interactions.
  • Loading branch information
tphakala committed Dec 20, 2024
1 parent 5323e45 commit c3e6948
Showing 1 changed file with 101 additions and 30 deletions.
131 changes: 101 additions & 30 deletions internal/imageprovider/wikipedia.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,27 @@ package imageprovider

import (
"bytes"
"context"
"fmt"
"log"
"strings"
"time"

"cgt.name/pkg/go-mwclient"
"github.com/antonholmquist/jason"
"github.com/google/uuid"
"github.com/k3a/html2text"
"github.com/tphakala/birdnet-go/internal/conf"
"golang.org/x/net/html"
"golang.org/x/time/rate"
)

// wikiMediaProvider implements the ImageProvider interface for Wikipedia.
type wikiMediaProvider struct {
	client     *mwclient.Client // MediaWiki API client used to issue queries
	debug      bool             // when true, methods emit verbose debug log lines
	limiter    *rate.Limiter    // waited on before every API request
	maxRetries int              // maximum number of attempts per API query
}

// wikiMediaAuthor represents the author information for a Wikipedia image.
Expand All @@ -36,78 +42,130 @@ func NewWikiMediaProvider() (*wikiMediaProvider, error) {
if err != nil {
return nil, fmt.Errorf("failed to create mwclient: %w", err)
}

// Rate limit: 10 requests per second with burst of 10
return &wikiMediaProvider{
client: client,
debug: settings.Realtime.Dashboard.Thumbnails.Debug,
client: client,
debug: settings.Realtime.Dashboard.Thumbnails.Debug,
limiter: rate.NewLimiter(rate.Limit(10), 10),
maxRetries: 3,
}, nil
}

// queryWithRetry sends params to the API through the rate limiter, retrying
// failed requests with exponential backoff.
//
// Up to l.maxRetries attempts are made. Every attempt first waits on
// l.limiter and then calls l.client.Get. After a failed attempt the method
// sleeps 1s, 2s, 4s, ... before the next one; no sleep follows the final
// attempt, because there is nothing left to wait for.
//
// reqID is only used to tag debug log lines. On failure the returned error
// wraps either the rate limiter error or the error of the last attempt.
func (l *wikiMediaProvider) queryWithRetry(reqID string, params map[string]string) (*jason.Object, error) {
	var lastErr error
	for attempt := 0; attempt < l.maxRetries; attempt++ {
		if l.debug {
			log.Printf("[%s] Debug: API request attempt %d", reqID, attempt+1)
		}

		// Wait for the rate limiter before touching the network.
		if err := l.limiter.Wait(context.Background()); err != nil {
			return nil, fmt.Errorf("rate limiter error: %w", err)
		}

		resp, err := l.client.Get(params)
		if err == nil {
			return resp, nil
		}

		lastErr = err
		if l.debug {
			log.Printf("[%s] Debug: API request attempt %d failed: %v", reqID, attempt+1, err)
		}

		// Back off only when another attempt will follow; sleeping after the
		// last attempt would just delay reporting the error to the caller.
		if attempt < l.maxRetries-1 {
			time.Sleep(time.Second * time.Duration(1<<attempt))
		}
	}

	return nil, fmt.Errorf("all %d attempts failed, last error: %w", l.maxRetries, lastErr)
}

// queryAndGetFirstPage queries Wikipedia with given parameters and returns the first page hit.
// It handles the API request and response parsing.
func (l *wikiMediaProvider) queryAndGetFirstPage(params map[string]string) (*jason.Object, error) {
func (l *wikiMediaProvider) queryAndGetFirstPage(reqID string, params map[string]string) (*jason.Object, error) {
if l.debug {
log.Printf("Debug: Querying Wikipedia API with params: %v", params)
log.Printf("[%s] Debug: Querying Wikipedia API with params: %v", reqID, params)
}

resp, err := l.client.Get(params)
resp, err := l.queryWithRetry(reqID, params)
if err != nil {
if l.debug {
log.Printf("Debug: Wikipedia API query failed: %v", err)
log.Printf("Debug: Wikipedia API query failed after retries: %v", err)
}
return nil, fmt.Errorf("failed to query Wikipedia: %w", err)
}

if l.debug {
if obj, err := resp.Object(); err == nil {
log.Printf("[%s] Debug: Raw Wikipedia API response: %v", reqID, obj)
}
}

pages, err := resp.GetObjectArray("query", "pages")
if err != nil {
if l.debug {
log.Printf("Debug: Failed to parse Wikipedia response pages: %v", err)
log.Printf("[%s] Debug: Failed to parse Wikipedia response pages: %v", reqID, err)
if obj, err := resp.Object(); err == nil {
log.Printf("[%s] Debug: Response structure: %v", reqID, obj)
}
}
return nil, fmt.Errorf("failed to get pages from response: %w", err)
}

if l.debug {
if firstPage, err := pages[0].Object(); err == nil {
log.Printf("[%s] Debug: First page content: %v", reqID, firstPage)
log.Printf("[%s] Debug: Successfully retrieved Wikipedia page", reqID)
}
}

if len(pages) == 0 {
if l.debug {
log.Printf("Debug: No pages found in Wikipedia response for params: %v", params)
if obj, err := resp.Object(); err == nil {
log.Printf("Debug: Full response structure: %v", obj)
}
}
return nil, fmt.Errorf("no pages found for request: %v", params)
}

if l.debug {
log.Printf("Debug: Successfully retrieved Wikipedia page")
}
return pages[0], nil
}

// Fetch retrieves the bird image for a given scientific name.
// It queries for the thumbnail and author information, then constructs a BirdImage.
func (l *wikiMediaProvider) Fetch(scientificName string) (BirdImage, error) {
reqID := uuid.New().String()[:8] // Using first 8 chars for brevity
if l.debug {
log.Printf("Debug: Starting Wikipedia fetch for species: %s", scientificName)
log.Printf("[%s] Debug: Starting Wikipedia fetch for species: %s", reqID, scientificName)
}

thumbnailURL, thumbnailSourceFile, err := l.queryThumbnail(scientificName)
thumbnailURL, thumbnailSourceFile, err := l.queryThumbnail(reqID, scientificName)
if err != nil {
if l.debug {
log.Printf("Debug: Failed to fetch thumbnail for %s: %v", scientificName, err)
log.Printf("[%s] Debug: Failed to fetch thumbnail for %s: %v", reqID, scientificName, err)
}
return BirdImage{}, fmt.Errorf("failed to query thumbnail of bird: %s : %w", scientificName, err)
}

if l.debug {
log.Printf("Debug: Successfully retrieved thumbnail URL: %s", thumbnailURL)
log.Printf("Debug: Thumbnail source file: %s", thumbnailSourceFile)
log.Printf("[%s] Debug: Successfully retrieved thumbnail - URL: %s, File: %s", reqID, thumbnailURL, thumbnailSourceFile)
log.Printf("[%s] Debug: Thumbnail source file: %s", reqID, thumbnailSourceFile)
}

authorInfo, err := l.queryAuthorInfo(thumbnailSourceFile)
authorInfo, err := l.queryAuthorInfo(reqID, thumbnailSourceFile)
if err != nil {
if l.debug {
log.Printf("Debug: Failed to fetch author info for %s: %v", scientificName, err)
log.Printf("[%s] Debug: Failed to fetch author info for %s: %v", reqID, scientificName, err)
}
return BirdImage{}, fmt.Errorf("failed to query thumbnail credit of bird: %s : %w", scientificName, err)
}

if l.debug {
log.Printf("Debug: Successfully retrieved author info for %s - Author: %s", scientificName, authorInfo.name)
log.Printf("[%s] Debug: Successfully retrieved author info for %s - Author: %s", reqID, scientificName, authorInfo.name)
}

return BirdImage{
Expand All @@ -121,9 +179,9 @@ func (l *wikiMediaProvider) Fetch(scientificName string) (BirdImage, error) {

// queryThumbnail queries Wikipedia for the thumbnail image of the given scientific name.
// It returns the URL and file name of the thumbnail.
func (l *wikiMediaProvider) queryThumbnail(scientificName string) (url, fileName string, err error) {
func (l *wikiMediaProvider) queryThumbnail(reqID, scientificName string) (url, fileName string, err error) {
if l.debug {
log.Printf("Debug: Querying thumbnail for species: %s", scientificName)
log.Printf("[%s] Debug: Querying thumbnail for species: %s", reqID, scientificName)
}

params := map[string]string{
Expand All @@ -136,7 +194,7 @@ func (l *wikiMediaProvider) queryThumbnail(scientificName string) (url, fileName
"redirects": "",
}

page, err := l.queryAndGetFirstPage(params)
page, err := l.queryAndGetFirstPage(reqID, params)
if err != nil {
if l.debug {
log.Printf("Debug: Failed to query thumbnail page: %v", err)
Expand All @@ -161,17 +219,19 @@ func (l *wikiMediaProvider) queryThumbnail(scientificName string) (url, fileName
}

if l.debug {
log.Printf("Debug: Successfully retrieved thumbnail - URL: %s, File: %s", url, fileName)
log.Printf("[%s] Debug: Successfully retrieved thumbnail - URL: %s, File: %s", reqID, url, fileName)
log.Printf("[%s] Debug: Successfully retrieved thumbnail URL: %s", reqID, url)
log.Printf("[%s] Debug: Thumbnail source file: %s", reqID, fileName)
}

return url, fileName, nil
}

// queryAuthorInfo queries Wikipedia for the author information of the given thumbnail URL.
// It returns a wikiMediaAuthor struct containing the author and license information.
func (l *wikiMediaProvider) queryAuthorInfo(thumbnailURL string) (*wikiMediaAuthor, error) {
func (l *wikiMediaProvider) queryAuthorInfo(reqID, thumbnailURL string) (*wikiMediaAuthor, error) {
if l.debug {
log.Printf("Debug: Querying author info for thumbnail: %s", thumbnailURL)
log.Printf("[%s] Debug: Querying author info for thumbnail: %s", reqID, thumbnailURL)
}

params := map[string]string{
Expand All @@ -182,7 +242,7 @@ func (l *wikiMediaProvider) queryAuthorInfo(thumbnailURL string) (*wikiMediaAuth
"redirects": "",
}

page, err := l.queryAndGetFirstPage(params)
page, err := l.queryAndGetFirstPage(reqID, params)
if err != nil {
if l.debug {
log.Printf("Debug: Failed to query author info page: %v", err)
Expand All @@ -191,13 +251,18 @@ func (l *wikiMediaProvider) queryAuthorInfo(thumbnailURL string) (*wikiMediaAuth
}

if l.debug {
log.Printf("Debug: Processing image info response")
if obj, err := page.Object(); err == nil {
log.Printf("Debug: Processing image info response: %v", obj)
}
}

imageInfo, err := page.GetObjectArray("imageinfo")
if err != nil {
if l.debug {
log.Printf("Debug: Failed to extract image info: %v", err)
if obj, err := page.Object(); err == nil {
log.Printf("Debug: Page content: %v", obj)
}
}
return nil, fmt.Errorf("failed to get image info from response: %w", err)
}
Expand Down Expand Up @@ -237,7 +302,7 @@ func (l *wikiMediaProvider) queryAuthorInfo(thumbnailURL string) (*wikiMediaAuth
}

if l.debug {
log.Printf("Debug: Successfully extracted author info - Name: %s, URL: %s", text, href)
log.Printf("[%s] Debug: Successfully extracted author info - Name: %s, URL: %s", reqID, text, href)
}

return &wikiMediaAuthor{
Expand All @@ -251,6 +316,12 @@ func (l *wikiMediaProvider) queryAuthorInfo(thumbnailURL string) (*wikiMediaAuth
// extractArtistInfo tries to extract the author information from the given HTML string.
// It parses the HTML and attempts to find the most relevant link and text.
func extractArtistInfo(htmlStr string) (href, text string, err error) {
// First check if the string contains any HTML-like content
if !strings.Contains(htmlStr, "<") {
// If it's plain text, return it as the text with empty href
return "", strings.TrimSpace(htmlStr), nil
}

doc, err := html.Parse(strings.NewReader(htmlStr))
if err != nil {
return "", "", err
Expand All @@ -259,7 +330,7 @@ func extractArtistInfo(htmlStr string) (href, text string, err error) {
links := findLinks(doc)

if len(links) == 0 {
return "", html2text.HTML2Text(htmlStr), nil
return "", "", fmt.Errorf("failed to extract link from HTML: %s", htmlStr)
}

if len(links) == 1 {
Expand Down

0 comments on commit c3e6948

Please sign in to comment.