From bd36dff7b2919d44c1a6464351eb00a29caa725b Mon Sep 17 00:00:00 2001
From: Marlon Saglia <marlon@vespa.ai>
Date: Tue, 27 Aug 2024 14:47:59 +0200
Subject: [PATCH] feat: Add Jekyll-based build for Vespa Search

This commit introduces a new GitHub Action for building Jekyll-based
content compatible with Vespa Search. The changes include:

- Added the `jekyll-build-json/action.yml` file, which defines the
  inputs and configuration for the action.
- Added the `jekyll-build-json/Dockerfile` file, which sets up the
  build environment with the necessary dependencies.
- Added the `jekyll-build-json/Gemfile` file, which specifies the
  required Ruby gems for the Jekyll build.
- Added the `jekyll-build-json/entrypoint.sh` file, which is the
  entry point for the Docker container.
- Added the `docker-publish.yml` workflow file, which automates the
  build and publication of the Docker image for the action.

These changes enable the creation of Jekyll-based content that can be
easily consumed by the Vespa Search platform, improving the developer
experience and simplifying the integration process.
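
It also adds the reusable `jekyll-feed-to-vespa.yml` workflow and the
`vespa_index_generator.rb` Jekyll plugin that writes the Vespa feed JSON.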
---
 .github/workflows/docker-publish.yml       |  92 +++++++++++
 .github/workflows/jekyll-feed-to-vespa.yml |  25 +++
 jekyll-build-json/Dockerfile               |  32 ++++
 jekyll-build-json/Gemfile                  |  17 ++
 jekyll-build-json/action.yml               |   6 +
 jekyll-build-json/entrypoint.sh            |  32 ++++
 jekyll-build-json/vespa_index_generator.rb | 171 +++++++++++++++++++++
 7 files changed, 375 insertions(+)
 create mode 100644 .github/workflows/docker-publish.yml
 create mode 100644 .github/workflows/jekyll-feed-to-vespa.yml
 create mode 100644 jekyll-build-json/Dockerfile
 create mode 100644 jekyll-build-json/Gemfile
 create mode 100644 jekyll-build-json/action.yml
 create mode 100755 jekyll-build-json/entrypoint.sh
 create mode 100644 jekyll-build-json/vespa_index_generator.rb

diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
new file mode 100644
index 0000000..066628d
--- /dev/null
+++ b/.github/workflows/docker-publish.yml
@@ -0,0 +1,92 @@
+name: Docker
+
+# This workflow builds and publishes a container image for each GitHub Action
+# in this repository that ships a Dockerfile.
+
+on:
+  push:
+    branches:
+      - "main"
+      - "feature-*"
+    tags:
+      - "v*"
+
+jobs:
+  prepare:
+    # Finds all the Dockerfiles in the repository and sets the action names
+    # as output to reuse as matrix strategy in the build job
+    runs-on: ubuntu-latest
+
+    outputs:
+      matrix_json: ${{ steps.set-output.outputs.matrix_json }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Find Dockerfiles
+        id: set-output
+        run: |
+          # Directories containing a Dockerfile, one per line. Both patterns are anchored so that
+          # only the trailing "/Dockerfile" and the leading "./" are stripped from each path.
+          paths_with_docker=$(find . -type f -name 'Dockerfile' | sed -e 's|/Dockerfile$||' -e 's|^\./||')
+          # Transform the newline-separated paths into the JSON object used as matrix strategy
+          action_names=$(echo "${paths_with_docker}" | jq --raw-input --slurp --compact-output 'split("\n") | map(select(length > 0)) | { "action-name": . }')
+          echo "Found Dockerfiles in the repository:"
+          echo "${action_names}"
+
+          echo "matrix_json=${action_names}" >> "${GITHUB_OUTPUT}"
+
+  build:
+    runs-on: ubuntu-latest
+    # Builds and pushes one image per "action-name" entry reported by the prepare job.
+    needs:
+      - prepare
+
+    permissions:
+      contents: read
+      packages: write # lets GITHUB_TOKEN push the image to the package registry
+
+    strategy:
+      matrix: ${{ fromJson(needs.prepare.outputs.matrix_json) }}
+
+    env:
+      REGISTRY: ghcr.io
+      IMAGE_NAME: ${{ github.repository }}/${{ matrix.action-name }} # <owner>/<repo>/<directory of the Dockerfile>
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Log into registry ${{ env.REGISTRY }}
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      # NOTE(review): the "type=ref,event=pr" rule below cannot match because this workflow only runs on push.
+      - name: Extract Docker metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=ref,event=pr
+            type=ref,event=branch
+            type=semver,pattern={{version}}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: ${{ matrix.action-name }} # the action's own directory is the build context
+          file: ${{ matrix.action-name }}/Dockerfile
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          # platforms: linux/amd64,linux/arm64 # Enable if we ever need to build for ARM
+          cache-from: type=gha # build cache is read from and written to the "gha" cache backend
+          cache-to: type=gha,mode=max
diff --git a/.github/workflows/jekyll-feed-to-vespa.yml b/.github/workflows/jekyll-feed-to-vespa.yml
new file mode 100644
index 0000000..ab228e4
--- /dev/null
+++ b/.github/workflows/jekyll-feed-to-vespa.yml
@@ -0,0 +1,25 @@
+---
+name: Feed Jekyll posts to Vespa
+
+on:
+  workflow_call:
+
+defaults:
+  run:
+    # Specify to ensure "pipefail and errexit" are set.
+    # Ref: https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#defaultsrunshell
+    shell: bash
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build with Jekyll
+        uses: vespa-engine/gh-actions/jekyll-build-json@feature-jekyll-build-json-action
+      # "-exec ... +" lets find exit non-zero when jq rejects a file; with "\;" the step could never fail.
+      - name: Validate JSON
+        run: |
+          find _site -type f -name '*.json' -exec jq empty {} +
diff --git a/jekyll-build-json/Dockerfile b/jekyll-build-json/Dockerfile
new file mode 100644
index 0000000..2c1cbc7
--- /dev/null
+++ b/jekyll-build-json/Dockerfile
@@ -0,0 +1,32 @@
+ARG RUBY_VERSION=3.3
+FROM ruby:$RUBY_VERSION-slim
+# Compiler toolchain, git and the locales package (locale-gen is used further down).
+RUN apt-get update \
+  && apt-get install -y \
+  build-essential \
+  git \
+  locales
+# Path at which GitHub mounts the repository for Docker container actions.
+WORKDIR /github/workspace
+
+COPY Gemfile Gemfile
+# Install the bundle into the system gem location (path.system) instead of a project-local path.
+ENV NOKOGIRI_USE_SYSTEM_LIBRARIES=true
+RUN gem install bundler && \
+  bundle config set path.system true && \
+  bundle install --jobs 20 --retry 5 --quiet
+# Generate a UTF-8 locale and make it the default for everything run in the container.
+RUN \
+  echo "en_US UTF-8" > /etc/locale.gen && \
+  locale-gen en-US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+# Script executed when the action container starts (see ENTRYPOINT).
+COPY entrypoint.sh /entrypoint.sh
+
+RUN mkdir -p /opt/jekyll/plugins
+# entrypoint.sh passes this directory to "jekyll build -p", which loads the generator plugin.
+COPY vespa_index_generator.rb /opt/jekyll/plugins/vespa_index_generator.rb
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/jekyll-build-json/Gemfile b/jekyll-build-json/Gemfile
new file mode 100644
index 0000000..26e7462
--- /dev/null
+++ b/jekyll-build-json/Gemfile
@@ -0,0 +1,17 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# frozen_string_literal: true
+
+source "https://rubygems.org"
+
+# Manage our dependency on the version of the github-pages gem here.
+gem "github-pages", "= 232" # exact version pin
+
+# Explicitly include this gem here.
+# It is not directly included in the github-pages gem list of dependencies,
+# even though it is included in the original GitHub Pages build infrastructure.
+gem "jekyll-include-cache", "= 0.2.1"
+gem "jekyll-octicons", "~> 14.2" # NOTE(review): not covered by the comment above; presumably needed by the sites being built - confirm
+gem "minima", "~> 2.0" # NOTE(review): same - confirm why this theme is listed explicitly
+
+# Work-around for webrick no longer included in Ruby 3.0 (https://github.com/jekyll/jekyll/issues/8523)
+gem "webrick"
diff --git a/jekyll-build-json/action.yml b/jekyll-build-json/action.yml
new file mode 100644
index 0000000..efd3d94
--- /dev/null
+++ b/jekyll-build-json/action.yml
@@ -0,0 +1,6 @@
+name: 'Build Jekyll for Vespa Search'
+description: 'A simple GH Action for producing Jekyll build artifacts compatible with Vespa JSON'
+author: 'Vespa.ai'
+runs:
+  using: 'docker' # runs a prebuilt image rather than building the Dockerfile on every use
+  image: 'docker://ghcr.io/vespa-engine/gh-actions/jekyll-build-json:feature-jekyll-build-json-action' # NOTE(review): pinned to a feature-branch tag; presumably temporary - confirm before release
diff --git a/jekyll-build-json/entrypoint.sh b/jekyll-build-json/entrypoint.sh
new file mode 100755
index 0000000..0438b99
--- /dev/null
+++ b/jekyll-build-json/entrypoint.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Runs `jekyll build` with the Vespa plugin directory and reports a failed build as a GitHub error annotation.
+set -e
+set -o pipefail
+set -o nounset
+
+# Not used below; under `nounset` it still aborts early when GITHUB_WORKSPACE is missing.
+SOURCE_DIRECTORY=${GITHUB_WORKSPACE}/
+# Set environment variables required by supported plugins
+export JEKYLL_ENV="production"
+
+gem env
+
+bundle config set path.system true
+
+# Run the build, capturing stdout and stderr. The `|| exit_code=$?` guard is required:
+# under `set -e` a bare failing assignment would abort the script before the
+# failure could be reported as an annotation below.
+exit_code=0
+build_output="$(bundle exec jekyll build -p /opt/jekyll/plugins 2>&1)" || exit_code=$?
+
+if [ "$exit_code" -ne 0 ]; then
+  # Remove the newlines from the build_output as annotations do not support multiline
+  error=$(echo "$build_output" | tr '\n' ' ' | tr -s ' ')
+  echo "::error::$error"
+else
+  # Display the build_output directly
+  echo "$build_output"
+fi
+
+# Exit with the captured exit code
+exit "$exit_code"
diff --git a/jekyll-build-json/vespa_index_generator.rb b/jekyll-build-json/vespa_index_generator.rb
new file mode 100644
index 0000000..98a0dda
--- /dev/null
+++ b/jekyll-build-json/vespa_index_generator.rb
@@ -0,0 +1,171 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+require 'json'
+require 'nokogiri'
+require 'kramdown/parser/kramdown'
+
+module Jekyll
+
+    # Jekyll generator that writes "<namespace>_index.json", a JSON array with one
+    # Vespa "put" operation for every page whose front matter sets `index: true`.
+    class VespaIndexGenerator < Jekyll::Generator
+        priority :lowest
+        safe true
+
+        # Builds the feed for +site+ and writes it relative to the current working directory. Raises when
+        # the "search" configuration or its "namespace" is missing; returns nil (no file) for a site without pages.
+        def generate(site)
+            puts "::debug::VespaIndexGenerator"
+            raise "Wrong parameter type, expected Jekyll::Site, got #{site.class}" unless site.is_a?(Jekyll::Site)
+            raise "Search configuration not found" unless site.config["search"]
+            search_config = site.config["search"]
+            namespace = search_config["namespace"]
+            # Fail with a clear message instead of a TypeError in the string concatenations below.
+            raise "Search namespace not found" unless namespace.is_a?(String)
+            if site.pages.empty?
+                puts "::error::No pages found!"
+                return nil
+            end
+
+            puts "::debug::Pages found: #{site.pages.size}"
+            operations = []
+            site.pages.each do |page|
+                if should_skip?(search_config, page) || page.data["index"] != true
+                    puts "::debug::Page not indexed: #{page.url}, index flag: #{page.data['index']}"
+                    next
+                end
+
+                puts "::debug::Processing page: #{page.url}"
+                url = page.url
+                url += 'index.html' if url[-1, 1] == '/'
+
+                text = extract_text(page)
+                outlinks = extract_links(page)
+                headers = extract_headers(page)
+                keywords = get_keywords(page)
+
+                fields = {
+                    :path => url,
+                    :namespace => namespace,
+                    :title => page.data["title"],
+                    :content => text,
+                    :html => get_html(page),
+                    :term_count => text.split.length(),
+                    :last_updated => Time.now.to_i
+                }
+                fields[:outlinks] = outlinks if !outlinks.empty?
+                fields[:headers]  = headers  if !headers.empty?
+                fields[:keywords] = keywords if !keywords.empty?
+                operations.push({ :put => "id:" + namespace + ":doc::" + namespace + url, :fields => fields })
+            end
+            json = JSON.pretty_generate(operations)
+            puts "::debug::Writing index file: #{namespace}_index.json"
+            File.open(namespace + "_index.json", "w") { |f| f.write(json) }
+        end
+
+        # True when +page+ is listed under "exclude_pages" in the search configuration or is empty.
+        def should_skip?(search_config, page)
+            exclude_pages = search_config["exclude_pages"] || []
+            # Config entries are strings, so match them against the page's url and path, not the Page object.
+            return exclude_pages.include?(page.url) || exclude_pages.include?(page.path) || is_empty(page)
+        end
+
+        # True for pages that have neither content nor a title.
+        def is_empty(page)
+            # The generated client-side redirects should not be indexed -
+            # they have no title and no content
+            return page.content == "" && !page.data["title"]
+        end
+
+        # Returns the page body as HTML: ".md" pages are rendered with Kramdown, others are returned as-is.
+        def get_html(page)
+            # File.extname also copes with names without a dot, where rindex('.') would be nil.
+            return page.content unless File.extname(page.name) == ".md"
+            return Kramdown::Document.new(page.content).to_html
+        end
+
+        # Parses the HTML returned by get_html into a Nokogiri document.
+        def get_doc(page)
+            return Nokogiri::HTML(get_html(page))
+        end
+
+        # Re-sets the content of <pre> elements that contain a `{% highlight xml %}` block; returns +doc+.
+        def reset_xml_pre(doc)
+            # The highlighter works on un-quoted XML, so some docs have non-HTML elements like <services>
+            # Read and set such fields again for proper quoting and later text extraction (dirty hack ...)
+            doc.search('pre').each do |pre|
+                if pre.to_s =~ /\{% highlight xml %}/
+                    pre.content = pre.to_s.gsub("\n", " ")
+                        .gsub(/<pre>\s*\{% highlight xml %}(.+?)\{% endhighlight %}<\/pre>/, '\1')
+                end
+            end
+            return doc
+        end
+
+        # Returns the page's text on a single line, without <style> content and with Liquid tags cleaned up.
+        def extract_text(page)
+            doc = reset_xml_pre(get_doc(page))
+            doc.search('th,td').each{ |e| e.after "\n" }
+            doc.search('style').each{ |e| e.remove }
+            content = doc.xpath("//text()").to_s
+                .gsub("\r"," ")
+                .gsub("\n"," ")
+            return strip_liquid(content)
+        end
+
+        # Returns the href values of all <a> elements, without missing or empty ones.
+        def extract_links(page)
+            doc = get_doc(page)
+            links = doc.css('a').map { |link| link['href'] || "" }
+            return links.reject { |l| l.empty? }
+        end
+
+        # Returns the text of all h1-h4 elements (line breaks replaced by spaces), without empty ones.
+        def extract_headers(page)
+            doc = get_doc(page)
+            headers = doc.css('h1,h2,h3,h4').map { |header| header.content.gsub("\r"," ").gsub("\n"," ") }
+            return headers.reject { |h| h.empty? }
+        end
+
+        # Returns the front-matter "keywords" (comma-separated string or YAML list) as stripped, non-empty strings.
+        def get_keywords(page)
+            raw = page.data["keywords"]
+            return [] unless raw
+            parts = raw.is_a?(String) ? raw.split(/,/) : Array(raw)
+            return parts.map { |k| k.to_s.strip }.reject { |k| k.empty? }
+        end
+
+        # Replaces every Liquid tag ({% ... %}) in +text+ with the output of process_liquid.
+        def strip_liquid(text)
+            return text.gsub(/\{%(.+?)%}/) { "#{ process_liquid($1) }" } # .+? is a lazy match, match only once
+        end
+
+        # Cleans the inside of one Liquid tag: drops highlight/raw markers and include boilerplate, keeps the payload.
+        def process_liquid(match)
+            # https://ruby-doc.org/core-3.1.2/Regexp.html for the quotes
+            # ToDo: define the quote pattern (\"|\p{Pi}|\p{Pf}|') once and build regex using this as a parameter
+            #
+            # This is a poor man's solution to clean the data for search -
+            # the alternative is building the site and _then_ extract data
+            # That will however add jekyll build as a dependency for feeding, so keeping this simple for now
+            return match.gsub(/^\s*highlight\s*\w*/, "")
+                     .gsub(/^\s*(raw|endraw|endhighlight)/, "")
+                     .gsub(/^\s*include\s*(deprecated|important|note|query|warning).html\s*content=\s*(\"|\p{Pi}|\p{Pf}|')/, "")
+                     .gsub(/^\s*include\s*video-include.html\s.*video-title=\s*(\"|\p{Pi}|\p{Pf}|')/, "Find at vespa.ai/resources: ")
+                     .gsub(/^\s*include\s*pre-req.html\s*memory=\s*(\"|\p{Pi}|\p{Pf}|')(.*)/)  { "#{ process_pre_req($2) }" }
+                     .gsub(/(\"|\p{Pi}|\p{Pf}|')\s*$/, "")
+        end
+
+        # Expands the "<n> GB" memory value of a pre-req include into the prerequisites text.
+        def process_pre_req(match)
+            return match.gsub(/([0-9]*)\s*GB/, '
+                Docker: Docker Desktop for Mac/Windows, or Docker on Linux.
+                Operating system: Linux, macOS or Windows 10 Pro.
+                Architecture: x86_64 or arm64.
+                Minimum \1 GB RAM dedicated to Docker (the default is 2 GB on macOS). Memory recommendations.
+                Homebrew to install the Vespa CLI, or download Vespa CLI from Github releases.')
+                .gsub(/(\"|\p{Pi}|\p{Pf}|')\s*extra-reqs=\s*(\"|\p{Pi}|\p{Pf}|')/, "")
+        end
+    end
+
+end