[FEAT] Website depth scraping data connector #1191

Merged · 9 commits · May 14, 2024
20 changes: 20 additions & 0 deletions collector/extensions/index.js
@@ -1,5 +1,6 @@
const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
const { reqBody } = require("../utils/http");
const { validURL } = require("../utils/url");

function extensions(app) {
  if (!app) return;
@@ -86,6 +87,25 @@ function extensions(app) {
    }
  );

  app.post(
    "/ext/website-depth",
    [verifyPayloadIntegrity],
    async function (request, response) {
      try {
        const websiteDepth = require("../utils/extensions/WebsiteDepth");
        const { url, depth = 1, maxLinks = 20 } = reqBody(request);
        if (!validURL(url)) {
          response
            .status(400)
            .json({ success: false, reason: "Not a valid URL." });
          return;
        }

        const scrapedData = await websiteDepth(url, depth, maxLinks);
        response.status(200).json({ success: true, data: scrapedData });
      } catch (e) {
        console.error(e);
        response.status(400).json({ success: false, reason: e.message });
      }
      return;
    }
  );

  app.post(
    "/ext/confluence",
    [verifyPayloadIntegrity],
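For reference, calling the new endpoint from another service might look like the sketch below. Only the route and the body fields (url, depth, maxLinks) come from this diff; the collector base URL is an assumption, and the signature header that verifyPayloadIntegrity expects is deployment-specific and omitted here.

// Hypothetical caller; adjust host/port and add the integrity header for your setup.
async function requestWebsiteDepthScrape() {
  const res = await fetch("http://localhost:8888/ext/website-depth", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      url: "https://example.com/docs", // hypothetical start page
      depth: 2, // follow links up to two levels from the start page
      maxLinks: 20, // stop discovering after 20 unique same-origin links
    }),
  });
  const { success, data, reason } = await res.json();
  if (!success) throw new Error(reason);
  return data;
}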
153 changes: 153 additions & 0 deletions collector/utils/extensions/WebsiteDepth/index.js
@@ -0,0 +1,153 @@
const { v4 } = require("uuid");
const {
PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");

// Breadth-first discovery of same-origin links reachable from startUrl,
// bounded by the requested depth and capped at maxLinks unique URLs.
async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
  const baseUrl = new URL(startUrl).origin;
  const discoveredLinks = new Set();
  const pendingLinks = [startUrl];
  let currentLevel = 0;
  depth = depth < 1 ? 1 : depth;
  maxLinks = maxLinks < 1 ? 1 : maxLinks;

  // Check depth and if there are any links left to scrape
  while (currentLevel < depth && pendingLinks.length > 0) {
    const newLinks = await getPageLinks(pendingLinks[0], baseUrl);
    pendingLinks.shift();

    for (const link of newLinks) {
      if (!discoveredLinks.has(link)) {
        discoveredLinks.add(link);
        pendingLinks.push(link);
      }

      // Exit out if we reach maxLinks
      if (discoveredLinks.size >= maxLinks) {
        return Array.from(discoveredLinks).slice(0, maxLinks);
      }
    }

    if (pendingLinks.length === 0) {
      currentLevel++;
    }
  }

  return Array.from(discoveredLinks);
}
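As a rough illustration (the URL is hypothetical), the discovery step alone behaves like this:

// Inside an async context: crawl at most 2 levels deep, stopping at 20 unique links.
const links = await discoverLinks("https://example.com", 2, 20);
// links is an array of absolute, same-origin URLs found while crawling.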

// Load a single page with Puppeteer and return the same-origin links found in its HTML.
async function getPageLinks(url, baseUrl) {
  try {
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: { headless: "new" },
      gotoOptions: { waitUntil: "domcontentloaded" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
    const links = extractLinks(html, baseUrl);
    return links;
  } catch (error) {
    console.error(`Failed to get page links from ${url}.`, error);
    return [];
  }
}

// Parse the raw HTML and collect absolute hrefs that stay on the base origin.
function extractLinks(html, baseUrl) {
  const root = parse(html);
  const links = root.querySelectorAll("a");
  const extractedLinks = new Set();

  for (const link of links) {
    const href = link.getAttribute("href");
    if (href) {
      const absoluteUrl = new URL(href, baseUrl).href;
      if (absoluteUrl.startsWith(baseUrl)) {
        extractedLinks.add(absoluteUrl);
      }
    }
  }

  return Array.from(extractedLinks);
}
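Relative hrefs are resolved against the base origin by the URL constructor before the same-origin check, so both relative and absolute anchors are handled; a quick illustration with hypothetical values:

// "/pricing" resolves onto the base origin and is kept;
// "https://other.org/page" fails the startsWith(baseUrl) check and is skipped.
new URL("/pricing", "https://example.com").href; // "https://example.com/pricing"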

// Scrape the visible text of each link with Puppeteer and persist each page
// as a document via writeToServerDocuments.
async function bulkScrapePages(links, outputFolder) {
  const scrapedData = [];

  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    console.log(`Scraping ${i + 1}/${links.length}: ${link}`);

    try {
      const loader = new PuppeteerWebBaseLoader(link, {
        launchOptions: { headless: "new" },
        gotoOptions: { waitUntil: "domcontentloaded" },
        async evaluate(page, browser) {
          const result = await page.evaluate(() => document.body.innerText);
          await browser.close();
          return result;
        },
      });
      const docs = await loader.load();
      const content = docs[0].pageContent;

      if (!content.length) {
        console.warn(`Empty content for ${link}. Skipping.`);
        continue;
      }

      const url = new URL(link);
      const filename = (url.host + "-" + url.pathname).replace(".", "_");

      const data = {
        id: v4(),
        url: "file://" + slugify(filename) + ".html",
        title: slugify(filename) + ".html",
        docAuthor: "no author found",
        description: "No description found.",
        docSource: "URL link uploaded by the user.",
        chunkSource: `link://${link}`,
        published: new Date().toLocaleString(),
        wordCount: content.split(" ").length,
        pageContent: content,
        token_count_estimate: tokenizeString(content).length,
      };

      writeToServerDocuments(data, data.title, outputFolder);
      scrapedData.push(data);

      console.log(`Successfully scraped ${link}.`);
    } catch (error) {
      console.error(`Failed to scrape ${link}.`, error);
    }
  }

  return scrapedData;
}

// Discover links from startUrl, scrape them, and write the resulting documents
// into server/storage/documents/<slugified hostname>.
async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
  const websiteName = new URL(startUrl).hostname;
  const outputFolder = path.resolve(
    __dirname,
    `../../../../server/storage/documents/${slugify(websiteName)}`
  );

  fs.mkdirSync(outputFolder, { recursive: true });

  console.log("Discovering links...");
  const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
  console.log(`Found ${linksToScrape.length} links to scrape.`);

  console.log("Starting bulk scraping...");
  const scrapedData = await bulkScrapePages(linksToScrape, outputFolder);
  console.log(`Scraped ${scrapedData.length} pages.`);

  return scrapedData;
}

module.exports = websiteScraper;
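A minimal sketch of consuming this module directly, the way the collector endpoint above does (the relative require path assumes the caller lives under collector/extensions; the URL and limits are illustrative):

// Hypothetical standalone usage of the scraper module.
const websiteDepth = require("../utils/extensions/WebsiteDepth");

(async () => {
  const docs = await websiteDepth("https://example.com", 2, 20);
  console.log(`Wrote ${docs.length} documents to server/storage/documents/.`);
})();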
2 changes: 2 additions & 0 deletions frontend/src/components/DataConnectorOption/media/index.js
@@ -1,10 +1,12 @@
import Github from "./github.svg";
import YouTube from "./youtube.svg";
import Link from "./link.svg";
import Confluence from "./confluence.jpeg";

const ConnectorImages = {
  github: Github,
  youtube: YouTube,
  websiteDepth: Link,
  confluence: Confluence,
};

1 change: 1 addition & 0 deletions frontend/src/components/DataConnectorOption/media/link.svg
(SVG image asset; not rendered in the diff view.)