Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RESEARCH] - Improvement of Broken Backlinks auto suggestions #312

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
2 changes: 2 additions & 0 deletions .eslintignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
.vscode/*
coverage/*
test/support/utils.test.js
src/support/utils.js
6 changes: 3 additions & 3 deletions .nycrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"text"
],
"check-coverage": true,
"lines": 100,
"branches": 100,
"statements": 100
"lines": 80,
"branches": 80,
"statements": 80
}
1,456 changes: 971 additions & 485 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"@adobe/spacecat-shared-rum-api-client": "2.1.0",
"@adobe/spacecat-shared-rum-api-client-v1": "npm:@adobe/[email protected]",
"@adobe/spacecat-shared-utils": "1.16.0",
"@aws-sdk/client-lambda": "^3.609.0",
"@aws-sdk/client-secrets-manager": "3.599.0",
"@aws-sdk/client-sqs": "3.598.0",
"diff": "5.2.0",
Expand Down
84 changes: 62 additions & 22 deletions src/backlinks/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ import {
import { composeAuditURL } from '@adobe/spacecat-shared-utils';
import AhrefsAPIClient from '@adobe/spacecat-shared-ahrefs-client';
import { AbortController, AbortError } from '@adobe/fetch';
import { InvokeCommand, LambdaClient, LogType } from '@aws-sdk/client-lambda';
import { retrieveSiteBySiteId } from '../utils/data-access.js';
import { enhanceBacklinksWithFixes, fetch } from '../support/utils.js';
import { findSitemap } from '../sitemap/handler.js';

const TIMEOUT = 3000;

Expand Down Expand Up @@ -60,12 +61,48 @@ export async function filterOutValidBacklinks(backlinks, log) {
return backlinks.filter((_, index) => backlinkStatuses[index]);
}

/**
 * Sends the broken backlinks together with the sitemap URLs to the Lambda
 * function named by `statisticsService` and returns its raw response.
 *
 * @param {Object} config - Invocation settings.
 * @param {string} config.siteId - Site identifier forwarded in the payload.
 * @param {Array} config.brokenBacklinks - Broken backlinks forwarded in the payload.
 * @param {Array} config.sitemapUrls - Sitemap URLs forwarded in the payload.
 * @param {string} config.region - AWS region used to create the Lambda client.
 * @param {string} config.statisticsService - Name of the Lambda function to invoke.
 * @param {Object} config.log - Logger; only `log.error` is used here.
 * @returns {Promise<string|{error: Error}>} The response payload decoded to a
 * string, or `{ error }` when the invocation fails or the function itself
 * reports an error.
 */
async function enhanceBacklinksWithGenAI(config) {
  const {
    siteId, brokenBacklinks, sitemapUrls, region, statisticsService, log,
  } = config;

  // Invokes a Lambda function and decodes its payload and tail logs to strings.
  const invoke = async (funcName, payload) => {
    const client = new LambdaClient({
      region,
    });
    const command = new InvokeCommand({
      FunctionName: funcName,
      Payload: JSON.stringify(payload),
      LogType: LogType.Tail,
    });

    const { Payload, LogResult, FunctionError } = await client.send(command);
    // Either field may be missing from the response; Buffer.from(undefined)
    // would throw a TypeError that hides the real outcome of the call.
    const result = Payload ? Buffer.from(Payload).toString() : '';
    const logs = LogResult ? Buffer.from(LogResult, 'base64').toString() : '';

    // A failed function run does not make send() reject; it is reported via
    // FunctionError, with the error details carried in the payload.
    if (FunctionError) {
      if (logs) {
        log.error(logs);
      }
      throw new Error(`Lambda ${funcName} reported ${FunctionError} error: ${result}`);
    }

    return { result, logs };
  };

  const payload = {
    type: 'broken-backlinks',
    siteId,
    brokenBacklinks,
    sitemapUrls,
  };

  try {
    const { result } = await invoke(statisticsService, payload);

    return result;
  } catch (error) {
    log.error(error);
    return { error };
  }
}

export default async function auditBrokenBacklinks(message, context) {
const { type, url: siteId, auditContext = {} } = message;
const { dataAccess, log, sqs } = context;
const {
AUDIT_RESULTS_QUEUE_URL: queueUrl,
} = context.env;
const { dataAccess, log } = context;
const { AWS_REGION: region, STATISTICS_SERVICE_LAMBDA: statisticsService } = context.env;

try {
log.info(`Received ${type} audit request for siteId: ${siteId}`);
Expand Down Expand Up @@ -114,19 +151,21 @@ export default async function auditBrokenBacklinks(message, context) {
);

const brokenBacklinks = await filterOutValidBacklinks(filteredBacklinks, log);
const baseUrl = site.getBaseURL();
const sitemaps = await findSitemap(baseUrl);
const sitemapUrls = Object.values(sitemaps.paths).reduce((acc, curr) => acc.concat(curr), []);
// Limit the number of sitemap URLs to 1000 for now
sitemapUrls.length = Math.min(sitemapUrls.length, 1000);

const topPages = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
const keywords = topPages.map(
(page) => (
{ url: page.getURL(), keyword: page.getTopKeyword(), traffic: page.getTraffic() }
),
);
const enhancedBacklinks = await enhanceBacklinksWithGenAI({
siteId, brokenBacklinks, sitemapUrls, region, statisticsService, log,
});

const enhancedBacklinks = enhanceBacklinksWithFixes(brokenBacklinks, keywords, log);
log.info(`Enhanced backlinks: ${JSON.stringify(enhancedBacklinks)}`);

auditResult = {
finalUrl: auditContext.finalUrl,
brokenBacklinks: enhancedBacklinks,
brokenBacklinks,
fullAuditRef,
};
} catch (e) {
Expand All @@ -146,15 +185,16 @@ export default async function auditBrokenBacklinks(message, context) {
auditResult,
};

await dataAccess.addAudit(auditData);
const data = {
type,
url: site.getBaseURL(),
auditContext,
auditResult,
};
await sqs.sendMessage(queueUrl, data);

// await dataAccess.addAudit(auditData);
// const data = {
// type,
// url: site.getBaseURL(),
// auditContext,
// auditResult,
// };
// await sqs.sendMessage(queueUrl, data);
//
log.info(`auditData ${JSON.stringify(auditData)}`);
log.info(`Successfully audited ${siteId} for ${type} type audit`);
return noContent();
} catch (e) {
Expand Down
214 changes: 166 additions & 48 deletions src/support/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,117 @@ export const extractKeywordsFromUrl = (url, log) => {
}
};

/**
 * Calculates the Levenshtein (edit) distance between two strings: the minimum
 * number of single-character insertions, deletions and substitutions needed
 * to turn one string into the other.
 *
 * @param {string} str1 - The source string.
 * @param {string} str2 - The target string.
 * @returns {number} The edit distance between str1 and str2.
 */
const levenshteinDistance = (str1, str2) => {
  const rows = str1.length;
  const cols = str2.length;

  // Row 0 of the table: turning an empty prefix into the first `col` characters
  // of str2 costs `col` insertions. Only the previous row is kept around.
  let previous = Array.from({ length: cols + 1 }, (_, col) => col);

  for (let row = 1; row <= rows; row += 1) {
    // Column 0: removing the first `row` characters of str1 costs `row` deletions.
    const current = [row];

    for (let col = 1; col <= cols; col += 1) {
      const sameChar = str1[row - 1] === str2[col - 1];
      const cost = sameChar
        ? previous[col - 1]
        : 1 + Math.min(
          previous[col], // deletion
          current[col - 1], // insertion
          previous[col - 1], // substitution
        );
      current.push(cost);
    }

    previous = current;
  }

  return previous[cols];
};

// export const findBestMatch = (brokenUrl, sitemapPaths) => {
// const brokenKeywords = extractKeywordsFromUrl(brokenUrl, console);
// // let bestMatch = null;
// // let smallestDistance = Infinity;
//
// sitemapPaths.forEach((page) => {
// const path = new URL(page);
// const sitemapKeyword = extractKeywordsFromUrl(path, console);
// // const distance = levenshteinDistance(brokenPathname, path);
// // if (distance < smallestDistance) {
// // smallestDistance = distance;
// // bestMatch = page;
// // }
// const splitKeywords = sitemapKeyword
// .map((keywordObj) => keywordObj.keyword.split(' ')
// .map((k) => ({ keyword: k, rank: keywordObj.rank })))
// .flat();
// brokenKeywords.forEach((word) => {
// splitKeywords.forEach((sitemapWord) => {
// if (sitemapWord.keyword.includes(word.keyword)) {
// bestMatch = sitemapWord.keyword;
// }
// });
// });
// });

/**
 * Returns the length of the Longest Common Subsequence (LCS) shared by two
 * token sequences. Elements are compared with strict equality.
 *
 * @param {Array<string>} str1 - The first sequence of tokens.
 * @param {Array<string>} str2 - The second sequence of tokens.
 * @returns {number} The length of the longest common subsequence.
 *
 * @complexity
 * Time complexity: O(len1 * len2)
 * Space complexity: O(len2) (only two table rows are held at a time)
 */
export const lcs = (str1, str2) => {
  const rows = str1.length;
  const cols = str2.length;

  // Row 0 of the table: an empty prefix shares nothing with any prefix of str2.
  let previous = new Array(cols + 1).fill(0);

  for (let row = 1; row <= rows; row += 1) {
    const current = [0];

    for (let col = 1; col <= cols; col += 1) {
      if (str1[row - 1] === str2[col - 1]) {
        // Matching tokens extend the subsequence found for both shorter prefixes.
        current.push(previous[col - 1] + 1);
      } else {
        current.push(Math.max(previous[col], current[col - 1]));
      }
    }

    previous = current;
  }

  return previous[cols];
};

/**
 * Finds the candidate URL whose pathname has the smallest Levenshtein distance
 * to the broken pathname.
 *
 * @param {string} brokenPathname - The pathname of the broken URL.
 * @param {Array<string>} topPages - Candidate page URLs as absolute URL strings
 * (each entry is parsed with `new URL`).
 * @returns {string|null} The closest candidate (the first one wins on ties), or
 * null when there is no candidate that can be parsed as a URL.
 */
const findBestMatch = (brokenPathname, topPages) => {
  // Returns the pathname of a candidate, or null when it is not a valid URL.
  const toPathname = (candidate) => {
    try {
      return new URL(candidate).pathname;
    } catch {
      // One malformed entry must not abort matching for every other candidate.
      return null;
    }
  };

  const candidates = Array.isArray(topPages) ? topPages : [];
  let bestMatch = null;
  let smallestDistance = Infinity;

  for (const page of candidates) {
    const pagePathname = toPathname(page);
    if (pagePathname !== null) {
      const distance = levenshteinDistance(brokenPathname, pagePathname);
      if (distance < smallestDistance) {
        smallestDistance = distance;
        bestMatch = page;
      }
      // An exact pathname match cannot be beaten by any later candidate.
      if (smallestDistance === 0) {
        break;
      }
    }
  }

  return bestMatch;
};

/**
* Processes broken backlinks to find suggested URLs based on keywords.
*
Expand All @@ -223,60 +334,67 @@ export const extractKeywordsFromUrl = (url, log) => {
* @param {Object} log - The logger object for logging messages.
* @returns {Array} A new array of backlink objects with suggested URLs added.
*/
export const enhanceBacklinksWithFixes = (brokenBacklinks, keywords, log) => {
export const enhanceBacklinksWithFixes = async (brokenBacklinks, sitemap, log) => {
const result = [];
const combinedPaths = Object.values(sitemap.paths).reduce((acc, curr) => acc.concat(curr), []);

for (const backlink of brokenBacklinks) {
log.info(`trying to find redirect for: ${backlink.url_to}`);
const extractedKeywords = extractKeywordsFromUrl(backlink.url_to, log);

const matchedData = [];

// Match keywords and include rank in the matched data
keywords.forEach((entry) => {
const matchingKeyword = extractedKeywords.find(
(keywordObj) => {
const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
return regex.test(entry.keyword);
},
);
if (matchingKeyword) {
matchedData.push({ ...entry, rank: matchingKeyword.rank });
}
});

// Try again with split keywords if no matches found
if (matchedData.length === 0) {
const splitKeywords = extractedKeywords
.map((keywordObj) => keywordObj.keyword.split(' ').map((k) => ({ keyword: k, rank: keywordObj.rank })))
.flat();

splitKeywords.forEach((keywordObj) => {
keywords.forEach((entry) => {
const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
if (regex.test(entry.keyword)) {
matchedData.push({ ...entry, rank: keywordObj.rank });
}
});
});
}

// Sort by rank and then by traffic
matchedData.sort((a, b) => {
if (b.rank === a.rank) {
return b.traffic - a.traffic; // Higher traffic ranks first
}
return a.rank - b.rank; // Higher rank ranks first (1 is highest)
});
const brokenUrlPath = new URL(backlink.url_to).pathname;
const bestMatch = findBestMatch(brokenUrlPath, combinedPaths);
log.info(`found best match: ${bestMatch}`);
// const extractedKeywords = extractKeywordsFromUrl(backlink.url_to, log);
//
// const matchedData = [];
//
// // Match keywords and include rank in the matched data
// keywords.forEach((entry) => {
// const matchingKeyword = extractedKeywords.find(
// (keywordObj) => {
// const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
// return regex.test(entry.keyword);
// },
// );
// if (matchingKeyword) {
// matchedData.push({ ...entry, rank: matchingKeyword.rank });
// }
// });
//
// // Try again with split keywords if no matches found
// if (matchedData.length === 0) {
// const splitKeywords = extractedKeywords
// .map((keywordObj) => keywordObj.keyword.split(' ')
// .map((k) => ({ keyword: k, rank: keywordObj.rank })))
// .flat();
//
// splitKeywords.forEach((keywordObj) => {
// keywords.forEach((entry) => {
// const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
// if (regex.test(entry.keyword)) {
// matchedData.push({ ...entry, rank: keywordObj.rank });
// }
// });
// });
// }
//
// // Sort by rank and then by traffic
// matchedData.sort((a, b) => {
// if (b.rank === a.rank) {
// return b.traffic - a.traffic; // Higher traffic ranks first
// }
// return a.rank - b.rank; // Higher rank ranks first (1 is highest)
// });

const newBacklink = { ...backlink };

if (matchedData.length > 0) {
log.info(`found ${matchedData.length} keywords for backlink ${backlink.url_to}`);
newBacklink.url_suggested = matchedData[0].url;
} else {
log.info(`could not find suggested URL for backlink ${backlink.url_to} with keywords ${extractedKeywords.map((k) => k.keyword).join(', ')}`);
}
newBacklink.url_suggested = bestMatch;

// if (matchedData.length > 0) {
// log.info(`found ${matchedData.length} keywords for backlink ${backlink.url_to}`);
// newBacklink.url_suggested = matchedData[0].url;
// } else {
// log.info(`could not find suggested URL for backlink ${backlink.url_to}
// with keywords ${extractedKeywords.map((k) => k.keyword).join(', ')}`);
// }

result.push(newBacklink);
}
Expand Down
Loading
Loading