From bd36dff7b2919d44c1a6464351eb00a29caa725b Mon Sep 17 00:00:00 2001
From: Marlon Saglia
Date: Tue, 27 Aug 2024 14:47:59 +0200
Subject: [PATCH] feat: Add Jekyll-based build for Vespa Search

This commit introduces a new GitHub Action that builds Jekyll-based content
and generates a JSON feed compatible with Vespa Search.

The changes include:

- Added the `jekyll-build-json/action.yml` file, which defines the action and
  the Docker image it runs.
- Added the `jekyll-build-json/Dockerfile` file, which sets up the build
  environment with the necessary dependencies.
- Added the `jekyll-build-json/Gemfile` file, which specifies the required Ruby
  gems for the Jekyll build.
- Added the `jekyll-build-json/entrypoint.sh` file, which is the entry point
  for the Docker container.
- Added the `jekyll-build-json/vespa_index_generator.rb` Jekyll plugin, which
  writes the indexable pages to `{namespace}_index.json` as Vespa put
  operations.
- Added the `docker-publish.yml` workflow file, which automates the build and
  publication of the Docker image for the action.

These changes enable the creation of Jekyll-based content that can be easily
consumed by the Vespa Search platform, improving the developer experience and
simplifying the integration process.

feat: Add Jekyll to Vespa feed workflow
---
 .github/workflows/docker-publish.yml       |  92 +++++++++++
 .github/workflows/jekyll-feed-to-vespa.yml |  25 +++
 jekyll-build-json/Dockerfile               |  32 ++++
 jekyll-build-json/Gemfile                  |  17 ++
 jekyll-build-json/action.yml               |   6 +
 jekyll-build-json/entrypoint.sh            |  32 ++++
 jekyll-build-json/vespa_index_generator.rb | 171 +++++++++++++++++++++
 7 files changed, 375 insertions(+)
 create mode 100644 .github/workflows/docker-publish.yml
 create mode 100644 .github/workflows/jekyll-feed-to-vespa.yml
 create mode 100644 jekyll-build-json/Dockerfile
 create mode 100644 jekyll-build-json/Gemfile
 create mode 100644 jekyll-build-json/action.yml
 create mode 100755 jekyll-build-json/entrypoint.sh
 create mode 100644 jekyll-build-json/vespa_index_generator.rb

diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
new file mode 100644
index 0000000..066628d
--- /dev/null
+++ b/.github/workflows/docker-publish.yml
@@ -0,0 +1,92 @@
+name: Docker
+
+# This workflow will generate container images for each of the Github Actions
+# which are using a Dockerfile.
+
+on:
+  push:
+    branches:
+      - "main"
+      - "feature-*"
+    tags:
+      - "v*"
+
+jobs:
+  prepare:
+    # Finds all the Dockerfiles in the repository and sets the action names
+    # as output to reuse as matrix strategy in the build job
+    runs-on: ubuntu-latest
+
+    outputs:
+      matrix_json: ${{ steps.set-output.outputs.matrix_json }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Find Dockerfiles
+        id: set-output
+        run: |
+          # Find all Dockerfiles in the repository
+          paths_with_docker=$(find -type f -name 'Dockerfile' | sed 's|/Dockerfile||' | sed 's|./||')
+          # Transform the list of paths into a valid JSON array to be used as matrix strategy
+          action_names=$(echo $paths_with_docker | jq --raw-input --compact-output 'split(" ") | { "action-name": . }')
+
+          echo "Found Dockerfiles in the repository:"
+          echo "${action_names}"
+
+          echo "matrix_json=${action_names}" >> "${GITHUB_OUTPUT}"
+
+  build:
+    runs-on: ubuntu-latest
+
+    needs:
+      - prepare
+
+    permissions:
+      contents: read
+      packages: write
+
+    strategy:
+      matrix: ${{ fromJson(needs.prepare.outputs.matrix_json) }}
+
+    env:
+      REGISTRY: ghcr.io
+      IMAGE_NAME: ${{ github.repository }}/${{ matrix.action-name }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Log into registry ${{ env.REGISTRY }}
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Extract Docker metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=ref,event=pr
+            type=ref,event=branch
+            type=semver,pattern={{version}}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: ${{ matrix.action-name }}
+          file: ${{ matrix.action-name }}/Dockerfile
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          # platforms: linux/amd64,linux/arm64 # Enable if we ever need to build for ARM
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/.github/workflows/jekyll-feed-to-vespa.yml b/.github/workflows/jekyll-feed-to-vespa.yml
new file mode 100644
index 0000000..ab228e4
--- /dev/null
+++ b/.github/workflows/jekyll-feed-to-vespa.yml
@@ -0,0 +1,25 @@
+---
+name: Feed Jekyll posts to Vespa
+
+on:
+  workflow_call:
+
+defaults:
+  run:
+    # Specify to ensure "pipefail and errexit" are set.
+    # Ref: https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#defaultsrunshell
+    shell: bash
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build with Jekyll
+        uses: vespa-engine/gh-actions/jekyll-build-json@feature-jekyll-build-json-action
+
+      - name: List JSON
+        run: |
+          find _site -type f -name '*.json' -exec jq empty {} \;
diff --git a/jekyll-build-json/Dockerfile b/jekyll-build-json/Dockerfile
new file mode 100644
index 0000000..2c1cbc7
--- /dev/null
+++ b/jekyll-build-json/Dockerfile
@@ -0,0 +1,32 @@
+ARG RUBY_VERSION=3.3
+FROM ruby:$RUBY_VERSION-slim
+
+RUN apt-get update \
+  && apt-get install -y \
+    build-essential \
+    git \
+    locales
+
+WORKDIR /github/workspace
+
+COPY Gemfile Gemfile
+
+ENV NOKOGIRI_USE_SYSTEM_LIBRARIES=true
+RUN gem install bundler && \
+  bundle config set path.system true && \
+  bundle install --jobs 20 --retry 5 --quiet
+
+RUN \
+  echo "en_US UTF-8" > /etc/locale.gen && \
+  locale-gen en-US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+
+COPY entrypoint.sh /entrypoint.sh
+
+RUN mkdir -p /opt/jekyll/plugins
+
+COPY vespa_index_generator.rb /opt/jekyll/plugins/vespa_index_generator.rb
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/jekyll-build-json/Gemfile b/jekyll-build-json/Gemfile
new file mode 100644
index 0000000..26e7462
--- /dev/null
+++ b/jekyll-build-json/Gemfile
@@ -0,0 +1,17 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# frozen_string_literal: true
+
+source "https://rubygems.org"
+
+# Manage our dependency on the version of the github-pages gem here.
+gem "github-pages", "= 232"
+
+# Explicitly include this gem here.
+# It is not directly included in the github-pages gem list of dependencies,
+# even though it is included in the original GitHub Pages build infrastructure.
+gem "jekyll-include-cache", "= 0.2.1"
+gem "jekyll-octicons", "~> 14.2"
+gem "minima", "~> 2.0"
+
+# Work-around for webrick no longer included in Ruby 3.0 (https://github.com/jekyll/jekyll/issues/8523)
+gem "webrick"
diff --git a/jekyll-build-json/action.yml b/jekyll-build-json/action.yml
new file mode 100644
index 0000000..efd3d94
--- /dev/null
+++ b/jekyll-build-json/action.yml
@@ -0,0 +1,6 @@
+name: 'Build Jekyll for Vespa Search'
+description: 'A simple GH Action for producing Jekyll build artifacts compatible with Vespa JSON'
+author: 'Vespa.ai'
+runs:
+  using: 'docker'
+  image: 'docker://ghcr.io/vespa-engine/gh-actions/jekyll-build-json:feature-jekyll-build-json-action'
diff --git a/jekyll-build-json/entrypoint.sh b/jekyll-build-json/entrypoint.sh
new file mode 100755
index 0000000..0438b99
--- /dev/null
+++ b/jekyll-build-json/entrypoint.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+set -o nounset
+
+SOURCE_DIRECTORY=${GITHUB_WORKSPACE}/
+
+# Set environment variables required by supported plugins
+export JEKYLL_ENV="production"
+
+gem env
+
+bundle config set path.system true
+
+# Run the command, capturing the output and the exit code.
+# With `set -e` a failing build would abort the script right here, before the
+# error could be reported, so the failure is caught with `||` instead.
+exit_code=0
+build_output="$(bundle exec jekyll build -p /opt/jekyll/plugins)" || exit_code=$?
+
+if [ $exit_code -ne 0 ]; then
+  # Remove the newlines from the build_output as annotations do not support multiline
+  error=$(echo "$build_output" | tr '\n' ' ' | tr -s ' ')
+  echo "::error::$error"
+else
+  # Display the build_output directly
+  echo "$build_output"
+fi
+
+# Exit with the captured exit code
+exit $exit_code
diff --git a/jekyll-build-json/vespa_index_generator.rb b/jekyll-build-json/vespa_index_generator.rb
new file mode 100644
index 0000000..98a0dda
--- /dev/null
+++ b/jekyll-build-json/vespa_index_generator.rb
@@ -0,0 +1,171 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+ +require 'json' +require 'nokogiri' +require 'kramdown/parser/kramdown' + +module Jekyll + + class VespaIndexGenerator < Jekyll::Generator + priority :lowest + safe true + + def generate(site) + puts "::debug::VespaIndexGenerator" + raise "Wrong parameter type, expected Jekyll::Site, got #{site.class}" unless site.is_a?(Jekyll::Site) + raise "Search configuration not found" unless site.config["search"] + + search_config = site.config["search"] + namespace = search_config["namespace"] + operations = [] + if site.pages.empty? + # Drop out with an error + puts "::error::No pages found!" + return nil + end + + puts "::debug::Pages found: #{site.pages.size}" + site.pages.each do |page| + if should_skip?(search_config, page) || page.data["index"] != true + puts "::debug::Page not indexed: #{page.url}, index flag: #{page.data['index']}" + next + end + + puts "::debug::Processing page: #{page.url}" + url = page.url + url += 'index.html' if url[-1, 1] == '/' + + text = extract_text(page) + outlinks = extract_links(page) + headers = extract_headers(page) + keywords = get_keywords(page) + + fields = { + :path => url, + :namespace => namespace, + :title => page.data["title"], + :content => text, + :html => get_html(page), + :term_count => text.split.length(), + :last_updated => Time.now.to_i + } + fields[:outlinks] = outlinks if !outlinks.empty? + fields[:headers] = headers if !headers.empty? + fields[:keywords] = keywords if !keywords.empty? 
+ operations.push({ + :put => "id:" + namespace + ":doc::" + namespace + url, + :fields => fields + }) + end + json = JSON.pretty_generate(operations) + puts "::debug::Writing index file: #{namespace}_index.json" + File.open(namespace + "_index.json", "w") { |f| f.write(json) } + end + + def should_skip?(search_config, page) + exclude_pages = search_config["exclude_pages"] || [] + return exclude_pages.include?(page) || is_empty(page) + end + + def is_empty(page) + # The generated client-side redirects should not be indexed - + # they have no title and node content + return page.content == "" && !page.data["title"] + end + + def get_html(page) + if page.name[page.name.rindex('.')+1..-1] == "md" + doc = Kramdown::Document.new(page.content).to_html + else + doc = page.content + end + end + + def get_doc(page) + if page.name[page.name.rindex('.')+1..-1] == "md" + doc = Nokogiri::HTML(Kramdown::Document.new(page.content).to_html) + else + doc = Nokogiri::HTML(page.content) + end + end + + def reset_xml_pre(doc) + # The highlighter works on un-quoted XML, so some docs have non-HTML elements like + # Read and set such fields again for proper quoting and later text extraction (dirty hack ...) + doc.search('pre').each do |pre| + if pre.to_s =~ /\{% highlight xml %}/ + pre.content = pre.to_s.gsub("\n", " ") + .gsub(/
\s*\{% highlight xml %}(.+?)\{% endhighlight %}<\/pre>/, '\1')
+                end
+            end
+            return doc
+        end
+
+        def extract_text(page)
+            doc = reset_xml_pre(get_doc(page))
+            doc.search('th,td').each{ |e| e.after "\n" }
+            doc.search('style').each{ |e| e.remove }
+            content = doc.xpath("//text()").to_s
+                .gsub("\r"," ")
+                .gsub("\n"," ")
+            return strip_liquid(content)
+        end
+
+        def extract_links(page)
+            doc = get_doc(page)
+            links = doc.css('a').map { |link| link['href'] || ""}
+            links.reject{ |l| l.empty? }.map{ |l| l }
+            return links
+        end
+
+        def extract_headers(page)
+            doc = get_doc(page)
+            headers = doc.css('h1,h2,h3,h4').map { |header| header.content.gsub("\r"," ").gsub("\n"," ") || ""}
+            headers.reject{ |h| h.empty? }.map{ |h| h }
+            return headers
+        end
+
+        def get_keywords(page)
+            doc = get_doc(page)
+            keywords = []
+            if page.data["keywords"]
+                page.data["keywords"].split(/,/).each do |k|
+                    k = k.strip
+                    keywords.push(k) if ! k.empty?
+                end
+            end
+            return keywords
+        end
+
+        def strip_liquid(text)
+            return text.gsub(/\{%(.+?)%}/) { "#{ process_liquid($1) }" } # .+? is a lazy match, match only once
+        end
+
+        def process_liquid(match)
+        # https://ruby-doc.org/core-3.1.2/Regexp.html for the quotes
+        # ToDo: define the quote pattern (\"|\p{Pi}|\p{Pf}|') once and build regex using this as a parameter
+        #
+        # This is a poor man's solution to clean the data for search -
+        # the alternative is building the site and _then_ extract data
+        # That will however add jekyll build as a dependency for feeding, so keeping this simple for now
+            return match.gsub(/^\s*highlight\s*\w*/, "")
+                     .gsub(/^\s*(raw|endraw|endhighlight)/, "")
+                     .gsub(/^\s*include\s*(deprecated|important|note|query|warning).html\s*content=\s*(\"|\p{Pi}|\p{Pf}|')/, "")
+                     .gsub(/^\s*include\s*video-include.html\s.*video-title=\s*(\"|\p{Pi}|\p{Pf}|')/, "Find at vespa.ai/resources: ")
+                     .gsub(/^\s*include\s*pre-req.html\s*memory=\s*(\"|\p{Pi}|\p{Pf}|')(.*)/)  { "#{ process_pre_req($2) }" }
+                     .gsub(/(\"|\p{Pi}|\p{Pf}|')\s*$/, "")
+        end
+
+        # Expands the arguments of a pre-req include into prerequisites text.
+        #
+        # match - the tag text that follows the opening quote of memory=
+        #         (passed in by process_liquid).
+        #
+        # A number followed by "GB" is replaced by the text block below, where \1
+        # inserts that number as the minimum RAM. The closing quote, extra-reqs=
+        # and its opening quote are then removed, so any extra requirements
+        # follow the block directly.
+        def process_pre_req(match)
+            return match.gsub(/([0-9]*)\s*GB/, '
+                Docker: Docker Desktop for Mac/Windows, or Docker on Linux.
+                Operating system: Linux, macOS or Windows 10 Pro.
+                Architecture: x86_64 or arm64.
+                Minimum \1 GB RAM dedicated to Docker (the default is 2 GB on macOS). Memory recommendations.
+                Homebrew to install the Vespa CLI, or download Vespa CLI from Github releases.')
+                .gsub(/(\"|\p{Pi}|\p{Pf}|')\s*extra-reqs=\s*(\"|\p{Pi}|\p{Pf}|')/, "")
+        end
+
+    end
+
+end