Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce genAI generated broken backlink fixes #348

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
36 changes: 22 additions & 14 deletions src/backlinks/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import AhrefsAPIClient from '@adobe/spacecat-shared-ahrefs-client';
import { AbortController, AbortError } from '@adobe/fetch';
import { retrieveSiteBySiteId } from '../utils/data-access.js';
import { enhanceBacklinksWithFixes, fetch } from '../support/utils.js';
import { findSitemap } from '../sitemap/handler.js';

const TIMEOUT = 3000;

Expand Down Expand Up @@ -64,6 +65,8 @@ export default async function auditBrokenBacklinks(message, context) {
const { type, url: siteId, auditContext = {} } = message;
const { dataAccess, log, sqs } = context;
const {
AWS_REGION: region,
SPACECAT_STATISTICS_LAMBDA_ARN: statisticsServiceArn,
AUDIT_RESULTS_QUEUE_URL: queueUrl,
} = context.env;

Expand Down Expand Up @@ -100,20 +103,7 @@ export default async function auditBrokenBacklinks(message, context) {
const filteredBacklinks = result?.backlinks?.filter(
(backlink) => !excludedURLs?.includes(backlink.url_to),
);
let brokenBacklinks = await filterOutValidBacklinks(filteredBacklinks, log);
try {
const topPages = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
const keywords = topPages.map(
(page) => ({
url: page.getURL(),
keyword: page.getTopKeyword(),
traffic: page.getTraffic(),
}),
);
brokenBacklinks = enhanceBacklinksWithFixes(brokenBacklinks, keywords, log);
} catch (e) {
log.error(`Enhancing backlinks with fixes for siteId ${siteId} failed with error: ${e.message}`, e);
}
const brokenBacklinks = await filterOutValidBacklinks(filteredBacklinks, log);

auditResult = {
finalUrl: auditContext.finalUrl,
Expand Down Expand Up @@ -143,6 +133,24 @@ export default async function auditBrokenBacklinks(message, context) {
auditContext,
auditResult,
};

try {
dzehnder marked this conversation as resolved.
Show resolved Hide resolved
const baseUrl = site.getBaseURL();
const sitemaps = await findSitemap(baseUrl);
dzehnder marked this conversation as resolved.
Show resolved Hide resolved
const sitemapUrls = Object.values(sitemaps.paths)
.reduce((acc, curr) => acc.concat(curr), []);
await enhanceBacklinksWithFixes({
siteId,
brokenBacklinks: auditResult.brokenBacklinks,
sitemapUrls,
region,
statisticsServiceArn,
log,
});
} catch (e) {
log.error(`Enhancing backlinks with fixes for siteId ${siteId} failed with error: ${e.message}`, e);
}

await sqs.sendMessage(queueUrl, data);

log.info(`Successfully audited ${siteId} for ${type} type audit`);
Expand Down
98 changes: 39 additions & 59 deletions src/support/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { hasText, resolveCustomerSecretsName } from '@adobe/spacecat-shared-util
import URI from 'urijs';
import { JSDOM } from 'jsdom';
import { GetSecretValueCommand, SecretsManagerClient } from '@aws-sdk/client-secrets-manager';
import { InvokeCommand, LambdaClient, LogType } from '@aws-sdk/client-lambda';

URI.preventInvalidHostname = true;

Expand Down Expand Up @@ -216,69 +217,48 @@ export const extractKeywordsFromUrl = (url, log) => {
};

/**
* Processes broken backlinks to find suggested URLs based on keywords.
*
* @param {Array} brokenBacklinks - The array of broken backlink objects to process.
* @param {Array} keywords - The array of keyword objects to match against.
* @param {Object} log - The logger object for logging messages.
* @returns {Array} A new array of backlink objects with suggested URLs added.
* Enhances the backlinks with fixes by triggering a Lambda function that calculates the fixes.
* @param config - The configuration object.
* @param config.siteId - The site ID.
* @param config.brokenBacklinks - The broken backlinks.
* @param config.sitemapUrls - The sitemap URLs.
* @param config.region - The AWS region.
* @param config.statisticsServiceArn - The ARN of the statistics service Lambda function.
* @param config.log - The logger.
* @returns {Promise<{status: string}>}
*/
export const enhanceBacklinksWithFixes = (brokenBacklinks, keywords, log) => {
const result = [];

for (const backlink of brokenBacklinks) {
log.info(`trying to find redirect for: ${backlink.url_to}`);
const extractedKeywords = extractKeywordsFromUrl(backlink.url_to, log);

const matchedData = [];

// Match keywords and include rank in the matched data
keywords.forEach((entry) => {
const matchingKeyword = extractedKeywords.find(
(keywordObj) => {
const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
return regex.test(entry.keyword);
},
);
if (matchingKeyword) {
matchedData.push({ ...entry, rank: matchingKeyword.rank });
}
export async function enhanceBacklinksWithFixes(config) {
const {
siteId, brokenBacklinks, sitemapUrls, region, statisticsServiceArn, log,
dzehnder marked this conversation as resolved.
Show resolved Hide resolved
} = config;

const invoke = async (funcArn, payload) => {
const client = new LambdaClient({ region });
const command = new InvokeCommand({
FunctionName: funcArn,
Payload: JSON.stringify(payload),
LogType: LogType.Tail,
InvocationType: 'Event',
});

// Try again with split keywords if no matches found
if (matchedData.length === 0) {
const splitKeywords = extractedKeywords
.map((keywordObj) => keywordObj.keyword.split(' ').map((k) => ({ keyword: k, rank: keywordObj.rank })))
.flat();

splitKeywords.forEach((keywordObj) => {
keywords.forEach((entry) => {
const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
if (regex.test(entry.keyword)) {
matchedData.push({ ...entry, rank: keywordObj.rank });
}
});
});
try {
await client.send(command);
log.info(`Lambda function ${funcArn} invoked successfully.`);
} catch (error) {
log.error(`Error invoking Lambda function ${funcArn}:`, error);
}
};

// Sort by rank and then by traffic
matchedData.sort((a, b) => {
if (b.rank === a.rank) {
return b.traffic - a.traffic; // Higher traffic ranks first
}
return a.rank - b.rank; // Higher rank ranks first (1 is highest)
});

const newBacklink = { ...backlink };
const payload = {
type: 'broken-backlinks',
payload: {
siteId,
brokenBacklinks,
sitemapUrls,
},
};

if (matchedData.length > 0) {
log.info(`found ${matchedData.length} keywords for backlink ${backlink.url_to}`);
newBacklink.url_suggested = matchedData[0].url;
} else {
log.info(`could not find suggested URL for backlink ${backlink.url_to} with keywords ${extractedKeywords.map((k) => k.keyword).join(', ')}`);
}
invoke(statisticsServiceArn, payload); // No need to await this call

result.push(newBacklink);
}
return result;
};
return { status: 'Lambda function invoked' };
}
148 changes: 36 additions & 112 deletions test/audits/backlinks.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
/* eslint-env mocha */

import { createSite } from '@adobe/spacecat-shared-data-access/src/models/site.js';
import { createSiteTopPage } from '@adobe/spacecat-shared-data-access/src/models/site-top-page.js';
import { createConfiguration } from '@adobe/spacecat-shared-data-access/src/models/configuration.js';
import { createOrganization } from '@adobe/spacecat-shared-data-access/src/models/organization.js';

Expand All @@ -22,6 +21,7 @@ import chaiAsPromised from 'chai-as-promised';
import sinon from 'sinon';
import sinonChai from 'sinon-chai';
import nock from 'nock';
import { LambdaClient } from '@aws-sdk/client-lambda';
import auditBrokenBacklinks from '../../src/backlinks/handler.js';

chai.use(sinonChai);
Expand Down Expand Up @@ -63,26 +63,6 @@ describe('Backlinks Tests', function () {

const configuration = createConfiguration(configurationData);

const siteTopPage = createSiteTopPage({
siteId: site.getId(),
url: `${site.getBaseURL()}/foo.html`,
traffic: 1000,
source: 'ahrefs',
geo: 'global',
importedAt: new Date('2024-06-18').toISOString(),
topKeyword: '404',
});

const siteTopPage2 = createSiteTopPage({
siteId: site.getId(),
url: `${site.getBaseURL()}/bar.html`,
traffic: 500,
source: 'ahrefs',
geo: 'global',
importedAt: new Date('2024-06-18').toISOString(),
topKeyword: '429',
});

const site2 = createSite({
id: 'site2',
baseURL: 'https://foo.com',
Expand Down Expand Up @@ -169,7 +149,6 @@ describe('Backlinks Tests', function () {
url_from: 'https://from.com/from-3',
url_to: 'https://foo.com/returns-429',
domain_traffic: 1000,
url_suggested: 'https://bar.foo.com/bar.html',
},
{
title: 'backlink that times out',
Expand Down Expand Up @@ -255,7 +234,6 @@ describe('Backlinks Tests', function () {
});

it('should filter out excluded URLs and include valid backlinks', async () => {
mockDataAccess.getTopPagesForSite.resolves([siteTopPage, siteTopPage2]);
mockDataAccess.getSiteByID = sinon.stub().withArgs('site1').resolves(siteWithExcludedUrls);
mockDataAccess.getConfiguration = sinon.stub().resolves(configuration);

Expand Down Expand Up @@ -287,58 +265,39 @@ describe('Backlinks Tests', function () {
);
});

it('should successfully perform an audit to detect broken backlinks, save and send the proper audit result', async () => {
it('should successfully perform an audit to detect broken backlinks, save and send the proper audit result, then trigger suggested fix lambda', async () => {
mockDataAccess.getSiteByID = sinon.stub().withArgs('site1').resolves(site);
mockDataAccess.getTopPagesForSite.resolves([]);
mockDataAccess.getConfiguration = sinon.stub().resolves(configuration);
const invokeStub = sinon.stub(LambdaClient.prototype, 'send').resolves();
const url = site.getBaseURL();

nock(site.getBaseURL())
const sampleSitemap = '<?xml version="1.0" encoding="UTF-8"?>\n'
+ '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
+ `<url> <loc>${url}/foo</loc></url>\n`
+ `<url> <loc>${url}/bar</loc></url>\n`
+ '</urlset>';

nock(url)
.get(/.*/)
.reply(200);

nock('https://ahrefs.com')
.get(/.*/)
.reply(200, auditResult);
nock(url)
.get('/robots.txt')
.reply(200, 'Allow: /');

const expectedMessage = {
type: message.type,
url: site.getBaseURL(),
auditContext: {
finalUrl: 'bar.foo.com',
},
auditResult: {
finalUrl: 'bar.foo.com',
brokenBacklinks: auditResult.backlinks,
fullAuditRef: 'https://ahrefs.com/site-explorer/broken-backlinks?select=title%2Curl_from%2Curl_to%2Ctraffic_domain&limit=50&mode=prefix&order_by=domain_rating_source%3Adesc%2Ctraffic_domain%3Adesc&target=bar.foo.com&output=json&where=%7B%22and%22%3A%5B%7B%22field%22%3A%22is_dofollow%22%2C%22is%22%3A%5B%22eq%22%2C1%5D%7D%2C%7B%22field%22%3A%22is_content%22%2C%22is%22%3A%5B%22eq%22%2C1%5D%7D%2C%7B%22field%22%3A%22domain_rating_source%22%2C%22is%22%3A%5B%22gte%22%2C29.5%5D%7D%2C%7B%22field%22%3A%22traffic_domain%22%2C%22is%22%3A%5B%22gte%22%2C500%5D%7D%2C%7B%22field%22%3A%22links_external%22%2C%22is%22%3A%5B%22lte%22%2C300%5D%7D%5D%7D',
},
};

const response = await auditBrokenBacklinks(message, context);

expect(response.status).to.equal(204);
expect(mockDataAccess.addAudit).to.have.been.calledOnce;
expect(context.sqs.sendMessage).to.have.been.calledOnce;
expect(context.sqs.sendMessage).to.have.been
.calledWith(context.env.AUDIT_RESULTS_QUEUE_URL, expectedMessage);
expect(context.log.info).to.have.been.calledWith('Successfully audited site1 for broken-backlinks type audit');
});

it('should successfully perform an audit to detect broken backlinks based on keywords from top pages', async () => {
mockDataAccess.getSiteByID = sinon.stub().withArgs('site1').resolves(site);
mockDataAccess.getTopPagesForSite.resolves([siteTopPage, siteTopPage2]);
mockDataAccess.getConfiguration = sinon.stub().resolves(configuration);

nock(site.getBaseURL())
.get(/.*/)
nock(url)
.head('/sitemap.xml')
.reply(200);
nock(url)
.head('/sitemap_index.xml')
.reply(200);

nock('https://ahrefs.com')
.get(/.*/)
.reply(200, auditResult);

const expectedEnhancedBacklinks = auditResult.backlinks;
expectedEnhancedBacklinks[0].url_suggested = 'https://bar.foo.com/foo.html';
expectedEnhancedBacklinks[2].url_suggested = 'https://bar.foo.com/bar.html';
nock(url)
.get('/sitemap.xml')
.reply(200, sampleSitemap);

const expectedMessage = {
type: message.type,
Expand All @@ -354,62 +313,27 @@ describe('Backlinks Tests', function () {
};

const response = await auditBrokenBacklinks(message, context);
const [command] = invokeStub.getCall(0).args;
const payload = JSON.parse(command.input.Payload);

expect(response.status).to.equal(204);
expect(mockDataAccess.addAudit).to.have.been.calledOnce;
expect(context.sqs.sendMessage).to.have.been.calledOnce;
expect(context.sqs.sendMessage).to.have.been
.calledWith(context.env.AUDIT_RESULTS_QUEUE_URL, expectedMessage);
});

it('should detect broken backlinks and save the proper audit result, even if the suggested fix fails', async () => {
mockDataAccess.getSiteByID = sinon.stub().withArgs('site1').resolves(site);
mockDataAccess.getTopPagesForSite.resolves([createSiteTopPage({
siteId: site.getId(),
url: `${site.getBaseURL()}/foo.html`,
traffic: 1000,
source: 'ahrefs',
geo: 'global',
importedAt: new Date('2024-06-18').toISOString(),
topKeyword: 'c++',
})]);
const brokenBacklink = {
backlinks: [
{
title: 'backlink that has a faulty path',
url_from: 'https://from.com/from-1',
url_to: 'https://foo.com/c++',
domain_traffic: 4000,
}],
};
mockDataAccess.getConfiguration = sinon.stub().resolves(configuration);
nock(site.getBaseURL())
.get(/.*/)
.reply(200);

nock('https://ahrefs.com')
.get(/.*/)
.reply(200, brokenBacklink);

const expectedMessage = {
type: message.type,
url: site.getBaseURL(),
auditContext: {
finalUrl: 'bar.foo.com',
},
auditResult: {
finalUrl: 'bar.foo.com',
brokenBacklinks: brokenBacklink.backlinks,
fullAuditRef: 'https://ahrefs.com/site-explorer/broken-backlinks?select=title%2Curl_from%2Curl_to%2Ctraffic_domain&limit=50&mode=prefix&order_by=domain_rating_source%3Adesc%2Ctraffic_domain%3Adesc&target=bar.foo.com&output=json&where=%7B%22and%22%3A%5B%7B%22field%22%3A%22is_dofollow%22%2C%22is%22%3A%5B%22eq%22%2C1%5D%7D%2C%7B%22field%22%3A%22is_content%22%2C%22is%22%3A%5B%22eq%22%2C1%5D%7D%2C%7B%22field%22%3A%22domain_rating_source%22%2C%22is%22%3A%5B%22gte%22%2C29.5%5D%7D%2C%7B%22field%22%3A%22traffic_domain%22%2C%22is%22%3A%5B%22gte%22%2C500%5D%7D%2C%7B%22field%22%3A%22links_external%22%2C%22is%22%3A%5B%22lte%22%2C300%5D%7D%5D%7D',
expect(invokeStub.calledOnce).to.be.true;
expect(payload).to.deep.equal({
type: 'broken-backlinks',
payload: {
siteId: 'site1',
brokenBacklinks: auditResult.backlinks,
sitemapUrls: [
'https://bar.foo.com/foo',
'https://bar.foo.com/bar',
],
},
};
const response = await auditBrokenBacklinks(message, context);

expect(response.status).to.equal(204);
expect(mockDataAccess.addAudit).to.have.been.calledOnce;
expect(context.sqs.sendMessage).to.have.been.calledOnce;
expect(context.sqs.sendMessage).to.have.been
.calledWith(context.env.AUDIT_RESULTS_QUEUE_URL, expectedMessage);
});
expect(context.log.info).to.have.been.calledWith('Successfully audited site1 for broken-backlinks type audit');
});

it('should successfully perform an audit to detect broken backlinks and set finalUrl, for baseUrl redirecting to www domain', async () => {
Expand Down
Loading
Loading