From bda8eda42f623750080076afe62fb350f9c85f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vin=C3=ADcius=20R=2E=20Miguel?= Date: Thu, 19 Oct 2023 14:39:29 -0300 Subject: [PATCH 1/2] registry: add parsing of Markdown images --- registry/Cargo.lock | 1 + registry/Cargo.toml | 1 + registry/src/readme.rs | 89 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/registry/Cargo.lock b/registry/Cargo.lock index 4ff6f764..317a3abf 100644 --- a/registry/Cargo.lock +++ b/registry/Cargo.lock @@ -2879,6 +2879,7 @@ dependencies = [ "dotenv", "env_logger", "futures", + "nom", "rand", "reqwest", "semver", diff --git a/registry/Cargo.toml b/registry/Cargo.toml index ea1ab8fe..a79489e6 100644 --- a/registry/Cargo.toml +++ b/registry/Cargo.toml @@ -31,3 +31,4 @@ rand = "0.8.5" sha2 = "0.10.6" tracing = "0.1.37" tracing-subscriber = "0.3.17" +nom = "7.1.3" diff --git a/registry/src/readme.rs b/registry/src/readme.rs index 9231b294..f49b113c 100644 --- a/registry/src/readme.rs +++ b/registry/src/readme.rs @@ -37,6 +37,55 @@ mod b64 { } } +mod markdown_parsing { + use nom::{ + bytes::complete::take_while, character::complete::char, sequence::delimited, IResult, + }; + + /// A Markdown image declaration of the form `![alt-text](path-or-link)`. 
+ #[derive(Debug)] + #[cfg_attr(test, derive(PartialEq))] + pub struct MarkdownImage<'a> { + pub alt_text: &'a str, + pub path_or_link: &'a str, + } + + fn parse_delimited(input: &str, start: char, end: char) -> IResult<&str, &str> { + delimited(char(start), take_while(|ch| ch != end), char(end))(input) + } + + fn parse_markdown_image(input: &str) -> IResult<&str, MarkdownImage<'_>> { + let (remaining, _) = char('!')(input)?; + let (remaining, alt_text) = parse_delimited(remaining, '[', ']')?; + let (remaining, path_or_link) = parse_delimited(remaining, '(', ')')?; + + Ok(( + remaining, + MarkdownImage { + alt_text, + path_or_link, + }, + )) + } + + pub fn parse_markdown_images(input: &str) -> Vec<MarkdownImage<'_>> { + let mut images = Vec::new(); + for (idx, _) in input.match_indices('!') { + let remaining = &input[idx..]; + + if let Ok((_, image)) = parse_markdown_image(remaining) { + // We're only interested in relative paths, so if we parsed a URL, skip it + if image.path_or_link.starts_with("http") { + continue; + } + images.push(image); + } + } + + images + } +} + pub struct GithubApiClient { token: String, client: Client, @@ -96,7 +145,10 @@ impl GithubApiClient { if is_markdown(&file_name) { // We already got the README in Markdown, although base64-encoded - String::from_utf8(b64::ws_decode(content)?).map_err(Into::into) + let readme = String::from_utf8(b64::ws_decode(content)?)?; + + // One problem now is that READMEs often link images to within the repository instead of some HTTP URL. + Ok(readme) } else { self.get_text(&readme_url).await } @@ -169,6 +221,41 @@ pub async fn fetch_and_save_readme( mod tests { use crate::readme::GitHubProject; + #[test] + fn parses_markdown_images() { + use super::markdown_parsing::{parse_markdown_images, MarkdownImage}; + + let test_case = r#" + | **
The Citus database is 100% open source.

Learn what's new in the [Citus 12.1 release blog](https://www.citusdata.com/blog/2023/09/22/adding-postgres-16-support-to-citus-12-1/) and the [Citus Updates page](https://www.citusdata.com/updates/).

**| + |---| +
+ + + + ![Citus Banner](images/citus-readme-banner.png) + + [![Latest Docs](https://img.shields.io/badge/docs-latest-brightgreen.svg)](https://docs.citusdata.com/) + [![Stack Overflow](https://img.shields.io/badge/Stack%20Overflow-%20-545353?logo=Stack%20Overflow)](https://stackoverflow.com/questions/tagged/citus) + [![Slack](https://cituscdn.azureedge.net/images/social/slack-badge.svg)](https://slack.citusdata.com/) + ![Tembo Banner](images/tembo-readme-banner.png) + [![Code Coverage](https://codecov.io/gh/citusdata/citus/branch/master/graph/badge.svg)](https://app.codecov.io/gh/citusdata/citus) + [![Twitter](https://img.shields.io/twitter/follow/citusdata.svg?label=Follow%20@citusdata)](https://twitter.com/intent/follow?screen_name=citusdata)"#; + + assert_eq!( + parse_markdown_images(test_case), + &[ + MarkdownImage { + alt_text: "Citus Banner", + path_or_link: "images/citus-readme-banner.png" + }, + MarkdownImage { + alt_text: "Tembo Banner", + path_or_link: "images/tembo-readme-banner.png" + } + ] + ); + } + #[test] fn parses_github_urls() { let pgmq = "https://github.com/tembo-io/pgmq"; From 1250f71600966e8217e63d418915c7d2e4d206dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vin=C3=ADcius=20R=2E=20Miguel?= Date: Thu, 19 Oct 2023 14:48:16 -0300 Subject: [PATCH 2/2] registry: try to replace relative paths in README images --- registry/Cargo.lock | 5 +-- registry/Cargo.toml | 1 + registry/src/readme.rs | 71 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 3 deletions(-) diff --git a/registry/Cargo.lock b/registry/Cargo.lock index 317a3abf..912635f5 100644 --- a/registry/Cargo.lock +++ b/registry/Cargo.lock @@ -283,9 +283,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.1" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" 
dependencies = [ "memchr", ] @@ -2870,6 +2870,7 @@ dependencies = [ "actix-multipart", "actix-web", "actix-web-httpauth", + "aho-corasick", "anyhow", "aws-config", "aws-sdk-s3", diff --git a/registry/Cargo.toml b/registry/Cargo.toml index a79489e6..390a4e9f 100644 --- a/registry/Cargo.toml +++ b/registry/Cargo.toml @@ -32,3 +32,4 @@ sha2 = "0.10.6" tracing = "0.1.37" tracing-subscriber = "0.3.17" nom = "7.1.3" +aho-corasick = "1.1.2" diff --git a/registry/src/readme.rs b/registry/src/readme.rs index f49b113c..32b5544e 100644 --- a/registry/src/readme.rs +++ b/registry/src/readme.rs @@ -1,5 +1,6 @@ use std::path::Path; +use aho_corasick::AhoCorasick; use reqwest::Client; use serde::{de::DeserializeOwned, Deserialize}; @@ -8,6 +9,8 @@ use crate::{ repository::Registry, }; +use self::markdown_parsing::{parse_markdown_images, MarkdownImage}; + fn is_markdown(file_name: &str) -> bool { let maybe_extension = Path::new(file_name).extension(); @@ -148,11 +151,62 @@ impl GithubApiClient { let readme = String::from_utf8(b64::ws_decode(content)?)?; // One problem now is that READMEs often link images to within the repository instead of some HTTP URL. 
- Ok(readme) + // We will look for relative paths in the README and attempt to replace them with links that could be rendered in pgt.dev + Ok(self.prepare_readme(readme, project).await) } else { self.get_text(&readme_url).await } } + + async fn prepare_readme(&self, readme: String, project: GitHubProject<'_>) -> String { + let relative_images = parse_markdown_images(&readme); + + if relative_images.is_empty() { + return readme; + } + + match self + .replace_relative_images(&readme, relative_images, project) + .await + { + Ok(updated_readme) => updated_readme, + Err(err) => { + tracing::warn!("Failed to update README: {err}, continuing."); + readme + } + } + } + + async fn replace_relative_images<'a, 'b>( + &self, + readme: &'a str, + images: Vec<MarkdownImage<'a>>, + project: GitHubProject<'b>, + ) -> anyhow::Result<String> { + let mut patterns = Vec::new(); + let mut replace_with = Vec::new(); + + #[derive(Deserialize)] + struct Response { + download_url: String, + } + + for image in images { + let path = image.path_or_link; + let url = project.build_content_url(path); + let Ok(response) = self.get_json::<Response>(&url).await else { + continue; + }; + + patterns.push(path); + replace_with.push(response.download_url); + } + + let ac = AhoCorasick::new(patterns)?; + + ac.try_replace_all(readme, &replace_with) + .map_err(Into::into) + } } #[derive(Debug, PartialEq)] @@ -186,6 +240,9 @@ impl<'a> GitHubProject<'a> { parse(url).ok_or_else(|| ExtensionRegistryError::InvalidGithubRepo(url.into())) } + /// Builds the URL for these endpoints: + /// * https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-repository-content + /// * https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-a-repository-readme-for-a-directory fn build_readme_url(&self) -> String { let Self { owner, @@ -200,6 +257,18 @@ impl<'a> GitHubProject<'a> { _ => format!("https://api.github.com/repos/{owner}/{name}/readme"), } } + + /// Builds the URL for the following endpoint: + /// * 
https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-repository-content + fn build_content_url(&self, path: &str) -> String { + let Self { + owner, + name, + subdir: _, + } = *self; + + format!("https://api.github.com/repos/{owner}/{name}/contents/{path}") + } } pub async fn fetch_and_save_readme(