2024-06-26 12:54:03 -05:00
|
|
|
import { LaunchOptions, Page, chromium, devices } from "playwright";
|
2023-12-13 16:32:01 -06:00
|
|
|
import { prisma } from "./db";
|
|
|
|
import createFile from "./storage/createFile";
|
2024-03-26 00:38:08 -05:00
|
|
|
import sendToWayback from "./preservationScheme/sendToWayback";
|
2023-12-13 16:32:01 -06:00
|
|
|
import { Collection, Link, User } from "@prisma/client";
|
|
|
|
import validateUrlSize from "./validateUrlSize";
|
2023-12-24 05:46:08 -06:00
|
|
|
import createFolder from "./storage/createFolder";
|
2024-04-01 01:56:54 -05:00
|
|
|
import generatePreview from "./generatePreview";
|
2024-04-08 18:35:06 -05:00
|
|
|
import { removeFiles } from "./manageLinkFiles";
|
2024-03-26 00:38:08 -05:00
|
|
|
import archiveAsSinglefile from "./preservationScheme/archiveAsSinglefile";
|
|
|
|
import archiveAsReadability from "./preservationScheme/archiveAsReadablility";
|
2024-06-26 12:54:03 -05:00
|
|
|
import shell from "shelljs";
|
2023-12-13 16:32:01 -06:00
|
|
|
|
|
|
|
/** A `Collection` record that also carries its owning `User`. */
type CollectionWithOwner = Collection & { owner: User };

/**
 * A `Link` record joined with its collection and that collection's owner.
 * This is the shape `archiveHandler` expects to receive.
 */
type LinksAndCollectionAndOwner = Link & { collection: CollectionWithOwner };
|
|
|
|
|
2023-12-29 22:59:00 -06:00
|
|
|
/**
 * Parses the `BROWSER_TIMEOUT` setting (in minutes).
 *
 * Falls back to 5 when the value is missing, not a number, non-finite, zero
 * or negative. The result is capped so that `minutes * 60000` still fits the
 * largest delay `setTimeout` accepts (2^31 - 1 ms); beyond that Node coerces
 * the delay to 1 ms and the timeout would fire immediately.
 */
const parseBrowserTimeout = (raw: string | undefined): number => {
  const fallbackMinutes = 5;
  const maxMinutes = Math.floor(2147483647 / 60000);

  const minutes = Number(raw);
  if (!Number.isFinite(minutes) || minutes <= 0) return fallbackMinutes;

  return Math.min(minutes, maxMinutes);
};

/** Maximum time, in minutes, a browser may stay open for a single link. */
const BROWSER_TIMEOUT = parseBrowserTimeout(process.env.BROWSER_TIMEOUT);
|
|
|
|
|
2023-12-13 16:32:01 -06:00
|
|
|
/**
 * Archives a single link.
 *
 * Launches a Chromium instance (optionally behind the proxy configured through
 * the `PROXY*` environment variables), runs the preservation steps for the
 * link and finally reconciles the link's archive columns in the database.
 * The preservation work is raced against a `BROWSER_TIMEOUT`-minute timer.
 *
 * @param link The link to archive, including its collection and owner.
 * @throws Re-throws whatever the preservation steps (or the timeout) raise,
 *   after logging it together with the link details.
 */
export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
  // Allow the user to configure a proxy.
  const browserOptions: LaunchOptions = {};
  if (process.env.PROXY) {
    browserOptions.proxy = {
      server: process.env.PROXY,
      bypass: process.env.PROXY_BYPASS,
      username: process.env.PROXY_USERNAME,
      password: process.env.PROXY_PASSWORD,
    };
  }

  const browser = await chromium.launch(browserOptions);

  let page: Page;
  try {
    const context = await browser.newContext({
      ...devices["Desktop Chrome"],
      ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true",
    });
    page = await context.newPage();

    createFolder({
      filePath: `archives/preview/${link.collectionId}`,
    });
    createFolder({
      filePath: `archives/${link.collectionId}`,
    });
  } catch (err) {
    // Nothing has been archived yet, so the link row is left untouched; only
    // make sure the freshly launched browser does not leak. A failure to
    // close is deliberately ignored so the original error is the one thrown.
    await browser.close().catch(() => undefined);
    throw err;
  }

  // The timer is created right before the race (no await in between) and is
  // always cleared in `finally`, so it can neither outlive the run nor reject
  // without a handler attached.
  let timeoutId: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<never>((_, reject) => {
    timeoutId = setTimeout(
      () =>
        reject(
          new Error(
            `Browser has been open for more than ${BROWSER_TIMEOUT} minutes.`
          )
        ),
      BROWSER_TIMEOUT * 60000
    );
  });

  try {
    await Promise.race([preserveLink(link, page), timeoutPromise]);
  } catch (err) {
    console.log(err);
    console.log("Failed Link details:", link);
    throw err;
  } finally {
    clearTimeout(timeoutId);

    try {
      await finalizeLink(link);
    } finally {
      // Close the browser even when the database reconciliation fails.
      await browser.close();
    }
  }
}

/**
 * Runs the preservation steps for `link` using the already opened `page`:
 * detects the link type, marks the formats as pending, then archives the
 * target as an image, a PDF or a web page.
 */
async function preserveLink(
  link: LinksAndCollectionAndOwner,
  page: Page
): Promise<void> {
  const user = link.collection?.owner;

  const validatedUrl = link.url ? await validateUrlSize(link.url) : undefined;

  // NOTE(review): this throws a bare string rather than an Error; kept as-is
  // because callers may rely on the thrown value.
  if (validatedUrl === null && process.env.IGNORE_URL_SIZE_LIMIT !== "true")
    throw "Something went wrong while retrieving the file size.";

  const contentType = validatedUrl?.get("content-type");
  let linkType = "url";
  let imageExtension = "png";

  if (!link.url) linkType = link.type;
  else if (contentType?.includes("application/pdf")) linkType = "pdf";
  else if (contentType?.startsWith("image")) {
    linkType = "image";
    if (contentType.includes("image/jpeg")) imageExtension = "jpeg";
    else if (contentType.includes("image/png")) imageExtension = "png";
  }

  const imageArchived = Boolean(link.image?.startsWith("archive"));
  const pdfArchived = Boolean(link.pdf?.startsWith("archive"));

  // `undefined` leaves a column untouched. Formats that are already archived
  // are never downgraded; the others are marked "pending" (or "unavailable"
  // when the owner disabled that format).
  await prisma.link.update({
    where: { id: link.id },
    data: {
      type: linkType,
      image: user.archiveAsScreenshot
        ? imageArchived
          ? undefined
          : "pending"
        : "unavailable",
      pdf: user.archiveAsPDF
        ? pdfArchived
          ? undefined
          : "pending"
        : "unavailable",
      readable: !link.readable?.startsWith("archive") ? "pending" : undefined,
      singlefile: !link.singlefile?.startsWith("archive")
        ? "pending"
        : undefined,
      // Keyed on the preview column itself (it used to look at `readable`,
      // which could flip an existing preview to "pending").
      preview: !link.preview?.startsWith("archive") ? "pending" : undefined,
      lastPreserved: new Date().toISOString(),
    },
  });

  // SingleFile (currently disabled)
  // if (
  //   !link.singlefile?.startsWith("archive") &&
  //   !link.singlefile?.startsWith("unavailable") &&
  //   user.archiveAsSinglefile &&
  //   link.url
  // )
  //   await archiveAsSinglefile(link);

  // send to archive.org
  if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);

  if (linkType === "image" && !imageArchived) {
    await imageHandler(link, imageExtension); // archive image (jpeg/png)
    return;
  }

  if (linkType === "pdf" && !pdfArchived) {
    await pdfHandler(link); // archive pdf
    return;
  }

  if (!link.url) return;

  // archive url
  await page.goto(link.url, { waitUntil: "domcontentloaded" });

  const content = await page.content();

  // Readability
  if (
    !link.readable?.startsWith("archives") &&
    !link.readable?.startsWith("unavailable")
  )
    await archiveAsReadability(content, link);

  // Preview
  if (
    !link.preview?.startsWith("archives") &&
    !link.preview?.startsWith("unavailable")
  )
    await getArchivePreview(link, page);

  // Screenshot/PDF
  if (
    (!link.image?.startsWith("archives") &&
      !link.image?.startsWith("unavailable")) ||
    (!link.pdf?.startsWith("archives") && !link.pdf?.startsWith("unavailable"))
  )
    await captureScreenshotAndPdf(link, page, user);
}

/**
 * Reconciles the link row once archiving has finished (or failed): every
 * format whose column does not point into `archives` is marked
 * "unavailable". If the link no longer exists, its files are removed.
 */
async function finalizeLink(link: LinksAndCollectionAndOwner): Promise<void> {
  const finalLink = await prisma.link.findUnique({
    where: { id: link.id },
  });

  if (!finalLink) {
    await removeFiles(link.id, link.collectionId);
    return;
  }

  const unavailableUnlessArchived = (value: string | null | undefined) =>
    !value?.startsWith("archives") ? "unavailable" : undefined;

  await prisma.link.update({
    where: { id: link.id },
    data: {
      lastPreserved: new Date().toISOString(),
      readable: unavailableUnlessArchived(finalLink.readable),
      image: unavailableUnlessArchived(finalLink.image),
      singlefile: unavailableUnlessArchived(finalLink.singlefile),
      pdf: unavailableUnlessArchived(finalLink.pdf),
      preview: unavailableUnlessArchived(finalLink.preview),
    },
  });
}
|
|
|
|
|
|
|
|
/**
 * Downloads the image a link points to and stores it as the link's image
 * archive.
 *
 * @param link Only `url` and `id` are used.
 * @param extension File extension (without dot) to store the image under.
 * @throws Error when the link has no URL or the download does not return a
 *   successful (2xx) response, so an error page is never saved as an image.
 */
const imageHandler = async ({ url, id }: Link, extension: string) => {
  if (!url) throw new Error(`Link ${id} has no URL to download an image from.`);

  const response = await fetch(url);
  if (!response.ok)
    throw new Error(
      `Failed to download image from ${url}: HTTP ${response.status}`
    );

  const buffer = Buffer.from(await response.arrayBuffer());

  // The link may have been deleted while the download was in flight.
  const linkExists = await prisma.link.findUnique({
    where: { id },
  });
  if (!linkExists) return;

  const filePath = `archives/${linkExists.collectionId}/${id}.${extension}`;

  await createFile({
    data: buffer,
    filePath,
  });

  await prisma.link.update({
    where: { id },
    data: {
      image: filePath,
    },
  });
};
|
|
|
|
|
|
|
|
/**
 * Downloads the PDF a link points to and stores it as the link's PDF archive.
 *
 * @param link Only `url` and `id` are used.
 * @throws Error when the link has no URL or the download does not return a
 *   successful (2xx) response, so an error page is never saved as a PDF.
 */
const pdfHandler = async ({ url, id }: Link) => {
  if (!url) throw new Error(`Link ${id} has no URL to download a PDF from.`);

  const response = await fetch(url);
  if (!response.ok)
    throw new Error(
      `Failed to download PDF from ${url}: HTTP ${response.status}`
    );

  const buffer = Buffer.from(await response.arrayBuffer());

  // The link may have been deleted while the download was in flight.
  const linkExists = await prisma.link.findUnique({
    where: { id },
  });
  if (!linkExists) return;

  const filePath = `archives/${linkExists.collectionId}/${id}.pdf`;

  await createFile({
    data: buffer,
    filePath,
  });

  await prisma.link.update({
    where: { id },
    data: {
      pdf: filePath,
    },
  });
};
|
2024-06-26 12:54:03 -05:00
|
|
|
|
|
|
|
/**
 * Produces the preview image for a link.
 *
 * When the loaded page declares an `og:image`, that image is fetched through
 * the browser and handed to `generatePreview`; otherwise a low-quality JPEG
 * screenshot of the current page is stored as the preview.
 *
 * @param link The link being archived.
 * @param page A page that has already navigated to the link's URL. It is
 *   navigated to the image and back again when an `og:image` is used.
 */
const getArchivePreview = async (
  link: LinksAndCollectionAndOwner,
  page: Page
) => {
  // A preview is already stored: avoid a pointless navigation round-trip.
  if (link.preview?.startsWith("archive")) return;

  const ogImageUrl = await page.evaluate(() => {
    const metaTag = document.querySelector('meta[property="og:image"]');
    return metaTag ? metaTag.getAttribute("content") : null;
  });

  if (ogImageUrl) {
    console.log("Found og:image URL:", ogImageUrl);

    // og:image may be relative; resolve it against the page it was found on.
    const imageUrl = new URL(ogImageUrl, page.url()).href;

    // Download the image
    const imageResponse = await page.goto(imageUrl);

    // Only use a successful response so an error page is not treated as image data.
    if (imageResponse && imageResponse.ok()) {
      const buffer = await imageResponse.body();
      // Awaited so the preview is finished (and failures surface) before the
      // caller moves on and reconciles the link row.
      await generatePreview(buffer, link.collectionId, link.id);
    }

    await page.goBack();
    return;
  }

  console.log("No og:image found");

  const screenshot = await page.screenshot({ type: "jpeg", quality: 20 });
  const previewPath = `archives/preview/${link.collectionId}/${link.id}.jpeg`;

  await createFile({
    data: screenshot,
    filePath: previewPath,
  });

  await prisma.link.update({
    where: { id: link.id },
    data: {
      preview: previewPath,
    },
  });
};
|
|
|
|
|
|
|
|
/**
 * Captures a full-page PNG screenshot and/or a PDF of the current page,
 * depending on the owner's settings, and records the resulting file paths on
 * the link.
 *
 * A path is only written for a format that was actually produced in this run
 * or that was already archived; a failed capture is logged and its column is
 * left untouched instead of pointing at a file that does not exist.
 *
 * @param link The link being archived.
 * @param page A page that has already navigated to the link's URL.
 * @param user The collection owner whose archive preferences apply.
 */
const captureScreenshotAndPdf = async (
  link: LinksAndCollectionAndOwner,
  page: Page,
  user: User
) => {
  await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30);

  // Check if the user hasn't deleted the link by the time we're done scrolling
  const linkExists = await prisma.link.findUnique({
    where: { id: link.id },
  });
  if (!linkExists) return;

  const imagePath = `archives/${linkExists.collectionId}/${link.id}.png`;
  const pdfPath = `archives/${linkExists.collectionId}/${link.id}.pdf`;

  const imageArchived = Boolean(link.image?.startsWith("archive"));
  const pdfArchived = Boolean(link.pdf?.startsWith("archive"));

  const captureImage = async () => {
    const screenshot = await page.screenshot({ fullPage: true, type: "png" });
    await createFile({ data: screenshot, filePath: imagePath });
  };

  const capturePdf = async () => {
    const pdf = await page.pdf({
      width: "1366px",
      height: "1931px",
      printBackground: true,
      margin: {
        top: process.env.PDF_MARGIN_TOP || "15px",
        bottom: process.env.PDF_MARGIN_BOTTOM || "15px",
      },
    });
    await createFile({ data: pdf, filePath: pdfPath });
  };

  // Both captures are started before either is awaited so they run concurrently.
  const [imageResult, pdfResult] = await Promise.allSettled([
    user.archiveAsScreenshot && !imageArchived ? captureImage() : undefined,
    user.archiveAsPDF && !pdfArchived ? capturePdf() : undefined,
  ]);

  // `undefined` leaves the column untouched.
  let image: string | undefined;
  if (user.archiveAsScreenshot) {
    if (imageArchived) image = link.image ?? undefined; // keep the existing archive path
    else if (imageResult.status === "fulfilled") image = imagePath;
    else console.log("Failed to capture screenshot:", imageResult.reason);
  }

  let pdf: string | undefined;
  if (user.archiveAsPDF) {
    if (pdfArchived) pdf = link.pdf ?? undefined; // keep the existing archive path
    else if (pdfResult.status === "fulfilled") pdf = pdfPath;
    else console.log("Failed to capture PDF:", pdfResult.reason);
  }

  await prisma.link.update({
    where: { id: link.id },
    data: { image, pdf },
  });
};
|
|
|
|
|
|
|
|
/**
 * Scrolls the document down in 100px steps every 100ms until the bottom is
 * reached or `AUTOSCROLL_TIMEOUT` seconds have elapsed, then jumps back to
 * the top.
 *
 * This function is handed to `page.evaluate`, so it runs inside the page and
 * must stay self-contained: it may only use its argument and browser globals.
 *
 * Whichever of the two conditions happens first stops BOTH timers, so the
 * page never keeps scrolling after this function has resolved.
 *
 * @param AUTOSCROLL_TIMEOUT Maximum time to keep scrolling, in seconds.
 */
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
  await new Promise<void>((resolve) => {
    const distance = 100;
    let totalHeight = 0;

    const finish = () => {
      clearInterval(scrollDown);
      clearTimeout(deadline);
      window.scroll(0, 0);
      resolve();
    };

    const scrollDown = setInterval(() => {
      // Re-read on every tick: the height can grow as content lazy-loads.
      const scrollHeight = document.body.scrollHeight;
      window.scrollBy(0, distance);
      totalHeight += distance;

      if (totalHeight >= scrollHeight) finish();
    }, 100);

    const deadline = setTimeout(finish, AUTOSCROLL_TIMEOUT * 1000);
  });
};
|