Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: verify sitemap urls #266

Merged
merged 6 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 46 additions & 11 deletions src/sitemap/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -146,17 +146,35 @@ export async function checkSitemap(sitemapUrl) {
}

/**
 * Filters a list of URLs to return only those that exist.
 *
 * A URL "exists" when a HEAD request to it succeeds with a 2xx status.
 * Non-OK responses are logged at info level; network/parse failures are
 * logged at error level. Both cases exclude the URL from the result.
 *
 * @async
 * @param {string[]} urls - An array of URLs to check.
 * @param {Object} log - The logging object to record information and errors.
 * @returns {Promise<string[]>} - A promise that resolves to an array of URLs that exist.
 */
async function filterValidUrls(urls, log) {
  const fetchPromises = urls.map(async (url) => {
    try {
      // HEAD avoids downloading response bodies; only the status matters here.
      const response = await fetch(url, { method: 'HEAD' });
      if (response.ok) {
        return url;
      }
      log.info(`URL ${url} returned status code ${response.status}`);
      return null;
    } catch (error) {
      log.error(`Failed to fetch URL ${url}: ${error.message}`);
      return null;
    }
  });

  // Each callback catches its own errors, so rejections should not occur;
  // allSettled is belt-and-braces so one unexpected failure cannot reject
  // the whole batch.
  const results = await Promise.allSettled(fetchPromises);

  // filter only the fulfilled promises that have a valid URL
  return results
    .filter((result) => result.status === 'fulfilled' && result.value !== null)
    .map((result) => result.value);
}

/**
Expand Down Expand Up @@ -232,9 +250,10 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls) {
 * If the number of extracted paths is 0, logs messages and returns the failure status and reasons.
*
* @param {string} inputUrl - The URL for which to find and validate the sitemap
 * @param {Object} log - The logging object used to record information and errors.
* @returns {Promise<{success: boolean, reasons: Array<{value}>, paths?: any}>} result of sitemap
*/
export async function findSitemap(inputUrl) {
export async function findSitemap(inputUrl, log) {
const logMessages = [];

const parsedUrl = extractDomainAndProtocol(inputUrl);
Expand Down Expand Up @@ -263,7 +282,7 @@ export async function findSitemap(inputUrl) {

if (!sitemapUrls.length) {
const commonSitemapUrls = [`${protocol}://${domain}/sitemap.xml`, `${protocol}://${domain}/sitemap_index.xml`];
sitemapUrls = await checkCommonSitemapUrls(commonSitemapUrls);
sitemapUrls = await filterValidUrls(commonSitemapUrls, log);
if (!sitemapUrls.length) {
logMessages.push({ value: `No sitemap found in robots.txt or common paths for ${protocol}://${domain}`, error: ERROR_CODES.NO_SITEMAP_IN_ROBOTS });
return { success: false, reasons: logMessages };
Expand All @@ -276,6 +295,22 @@ export async function findSitemap(inputUrl) {
);
const extractedPaths = await getBaseUrlPagesFromSitemaps(inputUrl, filteredSitemapUrls);

// check if URLs from each sitemap exist and remove entries if none exist
if (Object.entries(extractedPaths).length > 0) {
const extractedSitemapUrls = Object.keys(extractedPaths);
for (const s of extractedSitemapUrls) {
const urlsToCheck = extractedPaths[s];
// eslint-disable-next-line no-await-in-loop
const existingPages = await filterValidUrls(urlsToCheck, log);

if (existingPages.length === 0) {
delete extractedPaths[s];
} else {
extractedPaths[s] = existingPages;
}
}
}

if (Object.entries(extractedPaths).length > 0) {
logMessages.push({ value: 'Sitemaps found and validated successfully.' });
return {
Expand All @@ -299,7 +334,7 @@ export async function sitemapAuditRunner(baseURL, context) {
const { log } = context;
log.info(`Received sitemap audit request for ${baseURL}`);
const startTime = process.hrtime();
const auditResult = await findSitemap(baseURL);
const auditResult = await findSitemap(baseURL, log);

const endTime = process.hrtime(startTime);
const elapsedSeconds = endTime[0] + endTime[1] / 1e9;
Expand Down
175 changes: 148 additions & 27 deletions test/audits/sitemap.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,6 @@ describe('Sitemap Audit', () => {
+ `<sitemap><loc>${url}/sitemap_bar.xml</loc></sitemap>\n`
+ '</sitemapindex>';

let topPagesResponse;

beforeEach('setup', () => {
context = new MockContextBuilder()
.withSandbox(sandbox)
Expand All @@ -78,29 +76,6 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/sitemap_bar.xml')
.reply(200, sampleSitemapTwo);

topPagesResponse = {
result: {
pages: [
{
url: `${url}/foo`,
sum_traffic: 100,
},
{
url: `${url}/bar`,
sum_traffic: 200,
},
{
url: `${url}/baz`,
sum_traffic: 300,
},
{
url: `${url}/cux`,
sum_traffic: 400,
},
],
},
};
});

afterEach(() => {
Expand All @@ -115,6 +90,30 @@ describe('Sitemap Audit', () => {
.get('/robots.txt')
.reply(200, `Sitemap: ${url}/sitemap_foo.xml\nSitemap: ${url}/sitemap_bar.xml`);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand All @@ -137,9 +136,35 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/robots.txt')
.reply(200, `Sitemap: ${url}/sitemap_index.xml`);

nock(url)
.get('/sitemap_index.xml')
.reply(200, sitemapIndex);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand All @@ -166,10 +191,27 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/sitemap_foo.txt')
.reply(200, `${url}/foo\n${url}/bar`, { 'content-type': 'text/plain' });

nock(url)
.get('/sitemap_bar.txt')
.reply(200, `${url}/baz\n${url}/cux`, { 'content-type': 'text/plain' });

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand Down Expand Up @@ -205,6 +247,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand Down Expand Up @@ -442,6 +492,32 @@ describe('Sitemap Audit', () => {
}]);
});

it('should delete extracted paths when no valid pages exist', async () => {
  nock(url)
    .get('/robots.txt')
    .reply(200, `Sitemap: ${url}/sitemap.xml`);

  nock(url)
    .get('/sitemap.xml')
    .reply(200, sampleSitemap);

  nock(url)
    .head('/foo')
    .reply(404);

  nock(url)
    .head('/bar')
    .reply(404);

  // Pass the mock logger: findSitemap forwards it to filterValidUrls, whose
  // log.info call would otherwise throw on undefined — and that TypeError is
  // silently swallowed by Promise.allSettled, making the test pass for the
  // wrong reason (rejection instead of the intended 404 filtering).
  const result = await findSitemap(url, context.log);

  expect(result.success).to.equal(false);
  expect(result.reasons).to.deep.include({
    value: 'No valid paths extracted from sitemaps.',
    error: ERROR_CODES.NO_PATHS_IN_SITEMAP,
  });
});

it('should return success when sitemap is found in robots.txt', async () => {
nock(url)
.get('/robots.txt')
Expand All @@ -451,6 +527,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await findSitemap(url);
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -475,6 +559,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await findSitemap('https://some-domain.adobe');
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -499,6 +591,30 @@ describe('Sitemap Audit', () => {
.get('/sitemap_index.xml')
.reply(200, sitemapIndex);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await findSitemap(url);
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -516,8 +632,13 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemapMoreUrlsWWW);

topPagesResponse.result.pages[0].url = `${protocol}://www.${domain}/foo`;
topPagesResponse.result.pages[1].url = `${protocol}://www.${domain}/bar`;
nock(`${protocol}://www.${domain}`)
.head('/foo')
.reply(200);

nock(`${protocol}://www.${domain}`)
.head('/bar')
.reply(200);

const result = await findSitemap(`${protocol}://www.${domain}`);
expect(result.success).to.equal(true);
Expand Down
Loading