Skip to content

Index general search in Elasticsearch #48

Index general search in Elasticsearch

Index general search in Elasticsearch #48

name: Index general search in Elasticsearch
# **What it does**: It scrapes the whole site and dumps the records in a
# temp directory. Then it indexes that into Elasticsearch.
# **Why we have it**: We want our search indexes kept up to date.
# **Who does it impact**: Anyone using search on docs.
on:
workflow_dispatch:
inputs:
version:
description: "Version to exclusively generate the search index for. E.g. 'dotcom', 'ghes-3.12'"
required: false
default: ''
languages:
description: "Comma separated languages. E.g. 'en,ja, es' (defaults to all)"
required: false
default: ''
schedule:
- cron: '20 16 * * *' # Run every 24 hours at 20 minutes past the hour
workflow_run:
workflows: ['Purge Fastly']
types:
- completed
permissions:
contents: read
# This allows a subsequently queued workflow run to cancel previous runs
concurrency:
group: '${{ github.workflow }} @ ${{ github.head_ref }} ${{ github.event_name }}'
cancel-in-progress: true
env:
ELASTICSEARCH_URL: ${{ secrets.ELASTICSEARCH_URL }}
# Since we'll run in NODE_ENV=production, we need to be explicit that
# we don't want Hydro configured.
HYDRO_ENDPOINT: ''
HYDRO_SECRET: ''
jobs:
figureOutMatrix:
if: ${{ github.repository == 'github/docs-internal' }}
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.result }}
steps:
- uses: actions/github-script@e69ef5462fd455e02edcaf4dd7708eda96b9eda0 # v7.0.0
id: set-matrix
with:
script: |
// Edit this list for the definitive list of languages
// (other than English) we want to index in Elasticsearch.
const allNonEnglish = ["zh", "es", "pt", "ru", "ja", "fr", "de", "ko"]
const allPossible = ["en", ...allNonEnglish]
if (context.eventName === "workflow_run") {
if (context.payload.workflow_run.conclusion === "success") {
return ["en"]
}
console.warn(`NOTE! It was a workflow_run but not success ('${context.payload.workflow_run.conclusion}')`)
console.warn("This means we're not going to index anything in the next dependent step.")
return []
}
if (context.eventName === "workflow_dispatch") {
if (context.payload.inputs.languages) {
const clean = context.payload.inputs.languages.split(',').map(x => x.trim()).filter(Boolean)
const notRecognized = clean.find(x => !allPossible.includes(x))
if (notRecognized) {
throw new Error(`'${notRecognized}' is not a recognized language code`)
}
return clean
}
return allPossible
}
if (context.eventName === "schedule") {
return allNonEnglish
}
console.log(context)
throw new Error(`Unable figure out what languages to run (${context.eventName})`)
- name: Debug output
run: echo "${{ steps.set-matrix.outputs.result }}"
- name: Check out repo
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
with:
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
updateElasticsearchIndexes:
needs: figureOutMatrix
name: Update indexes
if: ${{ github.repository == 'github/docs-internal' && needs.figureOutMatrix.outputs.matrix != '[]' }}
runs-on: ubuntu-20.04-xl
strategy:
fail-fast: false
# When it's only English (i.e. a simple array of ['en']), this value
# does not matter. If it's ALL the languages, then we know we can
# be patient because it's a daily scheduled run and it's run by bots
# while humans are asleep. So there's no rush and no need to finish
# the whole job fast.
# As of June 2023, it takes about 10+ minutes to index one whole
# language and we have 8 non-English languages.
max-parallel: 3
matrix:
language: ${{ fromJSON(needs.figureOutMatrix.outputs.matrix) }}
steps:
- name: Check out repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Clone docs-internal-data
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
repository: github/docs-internal-data
# This works because user `docs-bot` has read access to that private repo.
token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}
path: docs-internal-data
- name: Clone all translations
if: ${{ matrix.language != 'en' }}
uses: ./.github/actions/clone-translations
with:
token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}
- uses: ./.github/actions/node-npm-setup
- uses: ./.github/actions/cache-nextjs
- name: Run build scripts
run: npm run build
- name: Start the server in the background
env:
ENABLE_DEV_LOGGING: false
run: |
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
# first sleep to give it a chance to start
sleep 6
curl --retry-connrefused --retry 4 -I http://localhost:4002/
- if: ${{ failure() }}
name: Debug server outputs on errors
run: |
echo "____STDOUT____"
cat /tmp/stdout.log
echo "____STDERR____"
cat /tmp/stderr.log
- name: Scrape records into a temp directory
env:
# If a reusable, or anything in the `data/*` directory is deleted
# you might get a
#
# RenderError: Can't find the key 'site.data.reusables...' in the scope
#
# But that'll get fixed in the next translation pipeline. For now,
# let's just accept an empty string instead.
THROW_ON_EMPTY: false
# Note that by default, this is '' (empty string) and that means
# the same as not set within the script.
VERSION: ${{ inputs.version }}
DOCS_INTERNAL_DATA: docs-internal-data
run: |
mkdir /tmp/records
npm run general-search-scrape -- /tmp/records \
--language ${{ matrix.language }}
ls -lh /tmp/records
- name: Check that Elasticsearch is accessible
run: |
curl --fail --retry-connrefused --retry 5 -I ${{ env.ELASTICSEARCH_URL }}
- name: Index into Elasticsearch
env:
# Must match what we used when scraping (npm run general-search-scrape)
# otherwise the script will seek other versions from disk that might
# not exist.
VERSION: ${{ inputs.version }}
run: |
npm run index-general-search -- /tmp/records \
--language ${{ matrix.language }} \
--stagger-seconds 5 \
--retries 5
- name: Check created indexes and aliases
run: |
# Not using `--fail` here because I've observed that it can fail
# with a rather cryptic 404 error when it should, if anything, be
# a 200 OK with a list of no indices.
curl --retry-connrefused --retry 5 ${{ env.ELASTICSEARCH_URL }}/_cat/indices?v
curl --retry-connrefused --retry 5 ${{ env.ELASTICSEARCH_URL }}/_cat/indices?v
- name: Purge Fastly edge cache
env:
FASTLY_TOKEN: ${{ secrets.FASTLY_TOKEN }}
FASTLY_SERVICE_ID: ${{ secrets.FASTLY_SERVICE_ID }}
FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }}
run: npm run purge-fastly-edge-cache
- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
with:
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}