Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

registry: try to replace relative paths in README images #500

Merged
merged 2 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions registry/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions registry/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,5 @@ rand = "0.8.5"
sha2 = "0.10.6"
tracing = "0.1.37"
tracing-subscriber = "0.3.17"
nom = "7.1.3"
aho-corasick = "1.1.2"
158 changes: 157 additions & 1 deletion registry/src/readme.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::path::Path;

use aho_corasick::AhoCorasick;
use reqwest::Client;
use serde::{de::DeserializeOwned, Deserialize};

Expand All @@ -8,6 +9,8 @@ use crate::{
repository::Registry,
};

use self::markdown_parsing::{parse_markdown_images, MarkdownImage};

fn is_markdown(file_name: &str) -> bool {
let maybe_extension = Path::new(file_name).extension();

Expand Down Expand Up @@ -37,6 +40,55 @@ mod b64 {
}
}

mod markdown_parsing {
use nom::{
bytes::complete::take_while, character::complete::char, sequence::delimited, IResult,
};

/// A Markdown image declaration of the form `![alt-text](path-or-link)`.
#[derive(Debug)]
#[cfg_attr(test, derive(PartialEq))]
pub struct MarkdownImage<'a> {
pub alt_text: &'a str,
pub path_or_link: &'a str,
}

fn parse_delimited(input: &str, start: char, end: char) -> IResult<&str, &str> {
delimited(char(start), take_while(|ch| ch != end), char(end))(input)
}

fn parse_markdown_image(input: &str) -> IResult<&str, MarkdownImage<'_>> {
let (remaining, _) = char('!')(input)?;
let (remaining, alt_text) = parse_delimited(remaining, '[', ']')?;
let (remaining, path_or_link) = parse_delimited(remaining, '(', ')')?;

Ok((
remaining,
MarkdownImage {
alt_text,
path_or_link,
},
))
}

pub fn parse_markdown_images(input: &str) -> Vec<MarkdownImage<'_>> {
let mut images = Vec::new();
for (idx, _) in input.match_indices('!') {
let remaining = &input[idx..];

if let Ok((_, image)) = parse_markdown_image(remaining) {
// We're only interested in relative paths, so if we parsed a URL, skip it
if image.path_or_link.starts_with("http") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great job catching this case - lots of repos will have urls like this: https://github.com/DarrenBaldwin07/DarrenBaldwin07/assets/68653294/a53fce84-b120-489d-a795-12be94e67641

continue;
}
images.push(image);
}
}

images
}
}

pub struct GithubApiClient {
token: String,
client: Client,
Expand Down Expand Up @@ -96,11 +148,65 @@ impl GithubApiClient {

if is_markdown(&file_name) {
// We already got the README in Markdown, although base64-encoded
String::from_utf8(b64::ws_decode(content)?).map_err(Into::into)
let readme = String::from_utf8(b64::ws_decode(content)?)?;

// One problem now is that READMEs often link images to within the repository instead of some HTTP URL.
// We will look for relative paths in the README and attempt to replace them with links that could be rendered in pgt.dev
Ok(self.prepare_readme(readme, project).await)
} else {
self.get_text(&readme_url).await
}
}

/// Post-processes a Markdown README so that images referenced by relative path
/// point at downloadable URLs instead.
///
/// The README is returned untouched when it contains no relative-path images.
/// If the rewrite fails, a warning is logged and the original README is returned.
async fn prepare_readme(&self, readme: String, project: GitHubProject<'_>) -> String {
    let images = parse_markdown_images(&readme);

    // Nothing to rewrite: hand the README back as-is.
    if images.is_empty() {
        return readme;
    }

    self.replace_relative_images(&readme, images, project)
        .await
        .unwrap_or_else(|err| {
            // Best effort only: a failed rewrite must not lose the README itself.
            tracing::warn!("Failed to update README: {err}, continuing.");
            readme
        })
}

async fn replace_relative_images<'a, 'b>(
&self,
readme: &'a str,
images: Vec<MarkdownImage<'a>>,
project: GitHubProject<'b>,
) -> anyhow::Result<String> {
let mut patterns = Vec::new();
let mut replace_with = Vec::new();

#[derive(Deserialize)]
struct Response {
download_url: String,
}

for image in images {
let path = image.path_or_link;
let url = project.build_content_url(path);
let Ok(response) = self.get_json::<Response>(&url).await else {
continue;
};

patterns.push(path);
replace_with.push(response.download_url);
}

let ac = AhoCorasick::new(patterns)?;

ac.try_replace_all(readme, &replace_with)
.map_err(Into::into)
}
}

#[derive(Debug, PartialEq)]
Expand Down Expand Up @@ -134,6 +240,9 @@ impl<'a> GitHubProject<'a> {
parse(url).ok_or_else(|| ExtensionRegistryError::InvalidGithubRepo(url.into()))
}

/// Builds the URL for these endpoints:
/// * https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-repository-content
/// * https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-a-repository-readme-for-a-directory
fn build_readme_url(&self) -> String {
let Self {
owner,
Expand All @@ -148,6 +257,18 @@ impl<'a> GitHubProject<'a> {
_ => format!("https://api.github.com/repos/{owner}/{name}/readme"),
}
}

/// Builds the "get repository content" URL for `path` within this repository:
/// * https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-repository-content
///
/// `path` is appended verbatim after `/contents/`.
///
/// NOTE(review): `subdir` is not used here, so `path` is always resolved from the
/// repository root. Confirm this is intended for READMEs fetched from a subdirectory.
fn build_content_url(&self, path: &str) -> String {
    let Self { owner, name, .. } = self;

    format!("https://api.github.com/repos/{owner}/{name}/contents/{path}")
}
}

pub async fn fetch_and_save_readme(
Expand All @@ -169,6 +290,41 @@ pub async fn fetch_and_save_readme(
mod tests {
use crate::readme::GitHubProject;

/// Checks that `parse_markdown_images` returns only the images declared with a
/// relative path, in order of appearance, from a README excerpt that also contains
/// inline HTML, plain links and badges backed by `https://` URLs.
#[test]
fn parses_markdown_images() {
use super::markdown_parsing::{parse_markdown_images, MarkdownImage};

// Fixture: two relative-path banners surrounded by URL-backed badges and plain links.
let test_case = r#"
| **<br/>The Citus database is 100% open source.<br/><img width=1000/><br/>Learn what's new in the [Citus 12.1 release blog](https://www.citusdata.com/blog/2023/09/22/adding-postgres-16-support-to-citus-12-1/) and the [Citus Updates page](https://www.citusdata.com/updates/).<br/><br/>**|
|---|
<br/>



![Citus Banner](images/citus-readme-banner.png)

[![Latest Docs](https://img.shields.io/badge/docs-latest-brightgreen.svg)](https://docs.citusdata.com/)
[![Stack Overflow](https://img.shields.io/badge/Stack%20Overflow-%20-545353?logo=Stack%20Overflow)](https://stackoverflow.com/questions/tagged/citus)
[![Slack](https://cituscdn.azureedge.net/images/social/slack-badge.svg)](https://slack.citusdata.com/)
![Tembo Banner](images/tembo-readme-banner.png)
[![Code Coverage](https://codecov.io/gh/citusdata/citus/branch/master/graph/badge.svg)](https://app.codecov.io/gh/citusdata/citus)
[![Twitter](https://img.shields.io/twitter/follow/citusdata.svg?label=Follow%20@citusdata)](https://twitter.com/intent/follow?screen_name=citusdata)"#;

// Only the two banners are expected; every badge image points at an `https://` URL.
assert_eq!(
parse_markdown_images(test_case),
&[
MarkdownImage {
alt_text: "Citus Banner",
path_or_link: "images/citus-readme-banner.png"
},
MarkdownImage {
alt_text: "Tembo Banner",
path_or_link: "images/tembo-readme-banner.png"
}
]
);
}

#[test]
fn parses_github_urls() {
let pgmq = "https://github.com/tembo-io/pgmq";
Expand Down
Loading