finalized archiveHandler/background worker logic
This commit is contained in:
parent
a001f70b9d
commit
b74ff01ce6
|
@ -15,7 +15,7 @@ NEXT_PUBLIC_DISABLE_REGISTRATION=
|
||||||
NEXT_PUBLIC_CREDENTIALS_ENABLED=
|
NEXT_PUBLIC_CREDENTIALS_ENABLED=
|
||||||
DISABLE_NEW_SSO_USERS=
|
DISABLE_NEW_SSO_USERS=
|
||||||
RE_ARCHIVE_LIMIT=
|
RE_ARCHIVE_LIMIT=
|
||||||
NEXT_PUBLIC_MAX_UPLOAD_SIZE=
|
NEXT_PUBLIC_MAX_FILE_SIZE=
|
||||||
|
|
||||||
# AWS S3 Settings
|
# AWS S3 Settings
|
||||||
SPACES_KEY=
|
SPACES_KEY=
|
||||||
|
|
|
@ -174,7 +174,7 @@ export default function UploadFileModal({ onClose }: Props) {
|
||||||
/>
|
/>
|
||||||
</label>
|
</label>
|
||||||
<p className="text-xs font-semibold mt-2">
|
<p className="text-xs font-semibold mt-2">
|
||||||
PDF, PNG, JPG (Up to {process.env.NEXT_PUBLIC_MAX_UPLOAD_SIZE || 30}
|
PDF, PNG, JPG (Up to {process.env.NEXT_PUBLIC_MAX_FILE_SIZE || 30}
|
||||||
MB)
|
MB)
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
@ -0,0 +1,272 @@
|
||||||
|
import { chromium, devices } from "playwright";
|
||||||
|
import { prisma } from "./db";
|
||||||
|
import createFile from "./storage/createFile";
|
||||||
|
import sendToWayback from "./sendToWayback";
|
||||||
|
import { Readability } from "@mozilla/readability";
|
||||||
|
import { JSDOM } from "jsdom";
|
||||||
|
import DOMPurify from "dompurify";
|
||||||
|
import { Collection, Link, User } from "@prisma/client";
|
||||||
|
import validateUrlSize from "./validateUrlSize";
|
||||||
|
import {
|
||||||
|
pdfAvailable,
|
||||||
|
readabilityAvailable,
|
||||||
|
screenshotAvailable,
|
||||||
|
} from "../shared/getArchiveValidity";
|
||||||
|
|
||||||
|
/**
 * A link row together with its collection and that collection's owner.
 *
 * The owner record carries the per-user archive preferences
 * (`archiveAsScreenshot`, `archiveAsPDF`, `archiveAsWaybackMachine`) that
 * `archiveHandler` reads.
 */
type LinksAndCollectionAndOwner = Link & {
  collection: Collection & {
    owner: User;
  };
};
|
||||||
|
|
||||||
|
/**
 * Archives a single link according to the preferences of its collection owner.
 *
 * Flow:
 *  1. Validate the remote size via `validateUrlSize` and derive the link type
 *     ("url", "pdf" or "image") from the response's content-type header.
 *  2. Mark the archive paths as "pending" and stamp `lastPreserved`.
 *  3. Images and PDFs are downloaded as-is (`imageHandler` / `pdfHandler`).
 *     Regular pages are opened in headless Chromium, run through Readability,
 *     then captured as a screenshot and/or PDF.
 *  4. In every case (success or failure) any format that did not end up with a
 *     stored file is marked "unavailable", and the browser is closed.
 *
 * @param link The link to archive, including its collection and owner.
 * @throws Re-throws whatever error interrupted archiving, after logging it.
 */
export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
  const browser = await chromium.launch();

  try {
    // Created inside the try so the browser is still closed if this fails.
    const context = await browser.newContext(devices["Desktop Chrome"]);
    const page = await context.newPage();

    const validatedUrl = link.url ? await validateUrlSize(link.url) : undefined;

    // validateUrlSize returns null when the resource exceeds the size limit.
    if (validatedUrl === null)
      throw new Error("File is too large to be stored.");

    const contentType = validatedUrl?.get("content-type");
    let linkType = "url";
    let imageExtension = "png";

    if (!link.url) linkType = link.type;
    else if (contentType === "application/pdf") linkType = "pdf";
    else if (contentType?.startsWith("image")) {
      linkType = "image";
      if (contentType === "image/jpeg") imageExtension = "jpeg";
      else if (contentType === "image/png") imageExtension = "png";
    }

    const user = link.collection?.owner;

    // Send to archive.org (not awaited: runs alongside the local archiving).
    if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);

    const targetLink = await prisma.link.update({
      where: { id: link.id },
      data: {
        type: linkType,
        screenshotPath: user.archiveAsScreenshot ? "pending" : undefined,
        pdfPath: user.archiveAsPDF ? "pending" : undefined,
        readabilityPath: "pending",
        lastPreserved: new Date().toISOString(),
      },
    });

    if (linkType === "image") {
      await imageHandler(link, imageExtension); // archive image (jpeg/png)
      return;
    } else if (linkType === "pdf") {
      await pdfHandler(link); // archive pdf
      return;
    } else if (user.archiveAsPDF || user.archiveAsScreenshot) {
      // Archive a regular web page.
      if (link.url) {
        await page.goto(link.url, { waitUntil: "domcontentloaded" });
      }

      const content = await page.content();

      // TODO Webarchive: capture an MHTML snapshot through a CDP session
      // ("Page.captureSnapshot", format "mhtml") and store it as
      // `archives/${targetLink.collectionId}/${link.id}.mhtml`.

      // Readability
      const window = new JSDOM("").window;
      const purify = DOMPurify(window);
      const cleanedUpContent = purify.sanitize(content);
      const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
      const article = new Readability(dom.window.document).parse();

      const articleText = article?.textContent
        .replace(/ +(?= )/g, "") // strip out multiple spaces
        .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks

      if (articleText) {
        const readabilityFilePath = `archives/${targetLink.collectionId}/${link.id}_readability.json`;

        await createFile({
          data: JSON.stringify(article),
          filePath: readabilityFilePath,
        });

        await prisma.link.update({
          where: { id: link.id },
          data: {
            readabilityPath: readabilityFilePath,
            textContent: articleText,
          },
        });
      }

      // Screenshot/PDF: scroll through the page first. A rejection here
      // (page too long) aborts the capture and is handled by the catch below.
      await page.evaluate(
        autoScroll,
        Number(process.env.AUTOSCROLL_TIMEOUT) || 30
      );

      // Check if the user hasn't deleted the link by the time we're done scrolling
      const linkExists = await prisma.link.findUnique({
        where: { id: link.id },
      });

      if (linkExists) {
        const screenshotFilePath = `archives/${linkExists.collectionId}/${link.id}.png`;
        const pdfFilePath = `archives/${linkExists.collectionId}/${link.id}.pdf`;

        // Both captures run concurrently; a disabled format is a no-op entry.
        const [screenshotResult, pdfResult] = await Promise.allSettled([
          user.archiveAsScreenshot
            ? page.screenshot({ fullPage: true }).then((screenshot) =>
                createFile({
                  data: screenshot,
                  filePath: screenshotFilePath,
                })
              )
            : undefined,
          user.archiveAsPDF
            ? page
                .pdf({
                  width: "1366px",
                  height: "1931px",
                  printBackground: true,
                  margin: { top: "15px", bottom: "15px" },
                })
                .then((pdf) =>
                  createFile({
                    data: pdf,
                    filePath: pdfFilePath,
                  })
                )
            : undefined,
        ]);

        if (screenshotResult.status === "rejected")
          console.log(screenshotResult.reason);
        if (pdfResult.status === "rejected") console.log(pdfResult.reason);

        // Only record a path for a capture that actually succeeded. A failed
        // one stays "pending" here and is marked "unavailable" in `finally`.
        await prisma.link.update({
          where: { id: link.id },
          data: {
            screenshotPath:
              user.archiveAsScreenshot &&
              screenshotResult.status === "fulfilled"
                ? screenshotFilePath
                : undefined,
            pdfPath:
              user.archiveAsPDF && pdfResult.status === "fulfilled"
                ? pdfFilePath
                : undefined,
          },
        });
      }
    }
  } catch (err) {
    console.log(err);
    // Log identifying fields only: the full link object embeds the owner's
    // user record, which should not end up in logs.
    console.log("Failed Link details:", { id: link.id, url: link.url });
    throw err;
  } finally {
    try {
      const finalLink = await prisma.link.findUnique({
        where: { id: link.id },
      });

      // Anything that did not end up with a stored file is "unavailable".
      // Image links keep their file in screenshotPath (see imageHandler) and
      // PDF links keep theirs in pdfPath (see pdfHandler), so those paths must
      // survive for their respective link types.
      if (finalLink)
        await prisma.link.update({
          where: { id: link.id },
          data: {
            readabilityPath:
              !finalLink.textContent ||
              !readabilityAvailable(finalLink) ||
              finalLink.type !== "url"
                ? "unavailable"
                : undefined,
            screenshotPath:
              !screenshotAvailable(finalLink) ||
              (finalLink.type !== "url" && finalLink.type !== "image")
                ? "unavailable"
                : undefined,
            pdfPath:
              !pdfAvailable(finalLink) ||
              (finalLink.type !== "url" && finalLink.type !== "pdf")
                ? "unavailable"
                : undefined,
          },
        });
    } finally {
      // Always release the browser, even if the cleanup queries above fail.
      await browser.close();
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Scrolls the page down in 100px steps every 100ms until the accumulated
 * distance reaches `document.body.scrollHeight`, then jumps back to the top.
 *
 * This function is handed to `page.evaluate`, so it must stay self-contained
 * and rely only on browser globals.
 *
 * @param AUTOSCROLL_TIMEOUT Maximum time to keep scrolling, in seconds.
 * @throws Error("Webpage was too long to be archived.") when the bottom is
 *         not reached within the timeout.
 */
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
  await new Promise<void>((resolve, reject) => {
    const distance = 100;
    let totalHeight = 0;

    const scrollDown = setInterval(() => {
      // Re-read on every tick: the page may grow while scrolling.
      const scrollHeight = document.body.scrollHeight;
      window.scrollBy(0, distance);
      totalHeight += distance;

      if (totalHeight >= scrollHeight) {
        clearInterval(scrollDown);
        clearTimeout(timeout); // don't leave the timeout timer pending
        window.scroll(0, 0);
        resolve();
      }
    }, 100);

    const timeout = setTimeout(() => {
      clearInterval(scrollDown); // stop scrolling once we give up
      reject(new Error(`Webpage was too long to be archived.`));
    }, AUTOSCROLL_TIMEOUT * 1000);
  });
};
|
||||||
|
|
||||||
|
/**
 * Downloads the image a link points to and stores it as the link's archive.
 *
 * The file is written to `archives/<collectionId>/<id>.<extension>` and that
 * path is saved in the link's `screenshotPath`. Nothing is written if the link
 * was deleted while the download was in progress.
 *
 * @param link      The link to archive; only `url` and `id` are read.
 * @param extension File extension for the stored file (the caller passes
 *                  "jpeg" or "png").
 * @throws Error when the link has no URL or the download does not return a
 *         successful HTTP status.
 */
const imageHandler = async ({ url, id }: Link, extension: string) => {
  if (!url) throw new Error(`Link ${id} has no URL to download an image from.`);

  const response = await fetch(url);

  // Without this check an error page would be stored as if it were the image.
  if (!response.ok)
    throw new Error(
      `Failed to download image for link ${id}: HTTP ${response.status}`
    );

  const buffer = Buffer.from(await response.arrayBuffer());

  // The link may have been deleted while the download was running.
  const linkExists = await prisma.link.findUnique({
    where: { id },
  });

  if (linkExists) {
    const filePath = `archives/${linkExists.collectionId}/${id}.${extension}`;

    await createFile({
      data: buffer,
      filePath,
    });

    await prisma.link.update({
      where: { id },
      data: {
        screenshotPath: filePath,
      },
    });
  }
};
|
||||||
|
|
||||||
|
/**
 * Downloads the PDF a link points to and stores it as the link's archive.
 *
 * The file is written to `archives/<collectionId>/<id>.pdf` and that path is
 * saved in the link's `pdfPath`. Nothing is written if the link was deleted
 * while the download was in progress.
 *
 * @param link The link to archive; only `url` and `id` are read.
 * @throws Error when the link has no URL or the download does not return a
 *         successful HTTP status.
 */
const pdfHandler = async ({ url, id }: Link) => {
  if (!url) throw new Error(`Link ${id} has no URL to download a PDF from.`);

  const response = await fetch(url);

  // Without this check an error page would be stored as if it were the PDF.
  if (!response.ok)
    throw new Error(
      `Failed to download PDF for link ${id}: HTTP ${response.status}`
    );

  const buffer = Buffer.from(await response.arrayBuffer());

  // The link may have been deleted while the download was running.
  const linkExists = await prisma.link.findUnique({
    where: { id },
  });

  if (linkExists) {
    const filePath = `archives/${linkExists.collectionId}/${id}.pdf`;

    await createFile({
      data: buffer,
      filePath,
    });

    await prisma.link.update({
      where: { id },
      data: {
        pdfPath: filePath,
      },
    });
  }
};
|
|
@ -1,13 +1,10 @@
|
||||||
import { prisma } from "@/lib/api/db";
|
import { prisma } from "@/lib/api/db";
|
||||||
import { LinkIncludingShortenedCollectionAndTags } from "@/types/global";
|
import { LinkIncludingShortenedCollectionAndTags } from "@/types/global";
|
||||||
import getTitle from "@/lib/shared/getTitle";
|
import getTitle from "@/lib/shared/getTitle";
|
||||||
import urlHandler from "@/lib/api/urlHandler";
|
|
||||||
import { UsersAndCollections } from "@prisma/client";
|
import { UsersAndCollections } from "@prisma/client";
|
||||||
import getPermission from "@/lib/api/getPermission";
|
import getPermission from "@/lib/api/getPermission";
|
||||||
import createFolder from "@/lib/api/storage/createFolder";
|
import createFolder from "@/lib/api/storage/createFolder";
|
||||||
import pdfHandler from "../../pdfHandler";
|
|
||||||
import validateUrlSize from "../../validateUrlSize";
|
import validateUrlSize from "../../validateUrlSize";
|
||||||
import imageHandler from "../../imageHandler";
|
|
||||||
|
|
||||||
export default async function postLink(
|
export default async function postLink(
|
||||||
link: LinkIncludingShortenedCollectionAndTags,
|
link: LinkIncludingShortenedCollectionAndTags,
|
||||||
|
@ -113,37 +110,5 @@ export default async function postLink(
|
||||||
|
|
||||||
createFolder({ filePath: `archives/${newLink.collectionId}` });
|
createFolder({ filePath: `archives/${newLink.collectionId}` });
|
||||||
|
|
||||||
newLink.url && linkType === "url"
|
|
||||||
? urlHandler(newLink.id, newLink.url, userId)
|
|
||||||
: undefined;
|
|
||||||
|
|
||||||
newLink.url && linkType === "pdf"
|
|
||||||
? pdfHandler(newLink.id, newLink.url)
|
|
||||||
: undefined;
|
|
||||||
|
|
||||||
newLink.url && linkType === "image"
|
|
||||||
? imageHandler(newLink.id, newLink.url, imageExtension)
|
|
||||||
: undefined;
|
|
||||||
|
|
||||||
!newLink.url && linkType === "pdf"
|
|
||||||
? await prisma.link.update({
|
|
||||||
where: { id: newLink.id },
|
|
||||||
data: {
|
|
||||||
pdfPath: "pending",
|
|
||||||
lastPreserved: new Date().toISOString(),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
: undefined;
|
|
||||||
|
|
||||||
!newLink.url && linkType === "image"
|
|
||||||
? await prisma.link.update({
|
|
||||||
where: { id: newLink.id },
|
|
||||||
data: {
|
|
||||||
screenshotPath: "pending",
|
|
||||||
lastPreserved: new Date().toISOString(),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
: undefined;
|
|
||||||
|
|
||||||
return { response: newLink, status: 200 };
|
return { response: newLink, status: 200 };
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,9 +9,9 @@ export default async function imageHandler(
|
||||||
extension: string,
|
extension: string,
|
||||||
file?: string
|
file?: string
|
||||||
) {
|
) {
|
||||||
const pdf = await fetch(url as string).then((res) => res.blob());
|
const image = await fetch(url as string).then((res) => res.blob());
|
||||||
|
|
||||||
const buffer = Buffer.from(await pdf.arrayBuffer());
|
const buffer = Buffer.from(await image.arrayBuffer());
|
||||||
|
|
||||||
const linkExists = await prisma.link.findUnique({
|
const linkExists = await prisma.link.findUnique({
|
||||||
where: { id: linkId },
|
where: { id: linkId },
|
||||||
|
|
|
@ -1,172 +0,0 @@
|
||||||
import { chromium, devices } from "playwright";
|
|
||||||
import { prisma } from "@/lib/api/db";
|
|
||||||
import createFile from "@/lib/api/storage/createFile";
|
|
||||||
import sendToWayback from "./sendToWayback";
|
|
||||||
import { Readability } from "@mozilla/readability";
|
|
||||||
import { JSDOM } from "jsdom";
|
|
||||||
import DOMPurify from "dompurify";
|
|
||||||
|
|
||||||
export default async function urlHandler(
|
|
||||||
linkId: number,
|
|
||||||
url: string,
|
|
||||||
userId: number
|
|
||||||
) {
|
|
||||||
const user = await prisma.user.findUnique({ where: { id: userId } });
|
|
||||||
|
|
||||||
const targetLink = await prisma.link.update({
|
|
||||||
where: { id: linkId },
|
|
||||||
data: {
|
|
||||||
screenshotPath: user?.archiveAsScreenshot ? "pending" : null,
|
|
||||||
pdfPath: user?.archiveAsPDF ? "pending" : null,
|
|
||||||
readabilityPath: "pending",
|
|
||||||
lastPreserved: new Date().toISOString(),
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
// archive.org
|
|
||||||
|
|
||||||
if (user?.archiveAsWaybackMachine) sendToWayback(url);
|
|
||||||
|
|
||||||
if (user?.archiveAsPDF || user?.archiveAsScreenshot) {
|
|
||||||
const browser = await chromium.launch();
|
|
||||||
const context = await browser.newContext(devices["Desktop Chrome"]);
|
|
||||||
const page = await context.newPage();
|
|
||||||
|
|
||||||
try {
|
|
||||||
await page.goto(url, { waitUntil: "domcontentloaded" });
|
|
||||||
|
|
||||||
const content = await page.content();
|
|
||||||
|
|
||||||
// TODO
|
|
||||||
// const session = await page.context().newCDPSession(page);
|
|
||||||
|
|
||||||
// const doc = await session.send("Page.captureSnapshot", {
|
|
||||||
// format: "mhtml",
|
|
||||||
// });
|
|
||||||
|
|
||||||
// const saveDocLocally = (doc: any) => {
|
|
||||||
// console.log(doc);
|
|
||||||
// return createFile({
|
|
||||||
// data: doc,
|
|
||||||
// filePath: `archives/${targetLink.collectionId}/${linkId}.mhtml`,
|
|
||||||
// });
|
|
||||||
// };
|
|
||||||
|
|
||||||
// saveDocLocally(doc.data);
|
|
||||||
|
|
||||||
// Readability
|
|
||||||
|
|
||||||
const window = new JSDOM("").window;
|
|
||||||
const purify = DOMPurify(window);
|
|
||||||
const cleanedUpContent = purify.sanitize(content);
|
|
||||||
const dom = new JSDOM(cleanedUpContent, { url: url });
|
|
||||||
const article = new Readability(dom.window.document).parse();
|
|
||||||
|
|
||||||
const articleText = article?.textContent
|
|
||||||
.replace(/ +(?= )/g, "") // strip out multiple spaces
|
|
||||||
.replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks
|
|
||||||
|
|
||||||
await createFile({
|
|
||||||
data: JSON.stringify(article),
|
|
||||||
filePath: `archives/${targetLink.collectionId}/${linkId}_readability.json`,
|
|
||||||
});
|
|
||||||
|
|
||||||
await prisma.link.update({
|
|
||||||
where: { id: linkId },
|
|
||||||
data: {
|
|
||||||
readabilityPath: `archives/${targetLink.collectionId}/${linkId}_readability.json`,
|
|
||||||
textContent: articleText,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
// Screenshot/PDF
|
|
||||||
|
|
||||||
let faulty = false;
|
|
||||||
|
|
||||||
await page
|
|
||||||
.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30)
|
|
||||||
.catch((err) => {
|
|
||||||
console.log(err);
|
|
||||||
faulty = true;
|
|
||||||
});
|
|
||||||
|
|
||||||
const linkExists = await prisma.link.findUnique({
|
|
||||||
where: { id: linkId },
|
|
||||||
});
|
|
||||||
|
|
||||||
if (linkExists && !faulty) {
|
|
||||||
if (user.archiveAsScreenshot) {
|
|
||||||
const screenshot = await page.screenshot({ fullPage: true });
|
|
||||||
await createFile({
|
|
||||||
data: screenshot,
|
|
||||||
filePath: `archives/${linkExists.collectionId}/${linkId}.png`,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (user.archiveAsPDF) {
|
|
||||||
const pdf = await page.pdf({
|
|
||||||
width: "1366px",
|
|
||||||
height: "1931px",
|
|
||||||
printBackground: true,
|
|
||||||
margin: { top: "15px", bottom: "15px" },
|
|
||||||
});
|
|
||||||
|
|
||||||
await createFile({
|
|
||||||
data: pdf,
|
|
||||||
filePath: `archives/${linkExists.collectionId}/${linkId}.pdf`,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
await prisma.link.update({
|
|
||||||
where: { id: linkId },
|
|
||||||
data: {
|
|
||||||
screenshotPath: user.archiveAsScreenshot
|
|
||||||
? `archives/${linkExists.collectionId}/${linkId}.png`
|
|
||||||
: null,
|
|
||||||
pdfPath: user.archiveAsPDF
|
|
||||||
? `archives/${linkExists.collectionId}/${linkId}.pdf`
|
|
||||||
: null,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
} else if (faulty) {
|
|
||||||
await prisma.link.update({
|
|
||||||
where: { id: linkId },
|
|
||||||
data: {
|
|
||||||
screenshotPath: null,
|
|
||||||
pdfPath: null,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} catch (err) {
|
|
||||||
console.log(err);
|
|
||||||
throw err;
|
|
||||||
} finally {
|
|
||||||
await browser.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
|
|
||||||
const timeoutPromise = new Promise<void>((_, reject) => {
|
|
||||||
setTimeout(() => {
|
|
||||||
reject(new Error(`Webpage was too long to be archived.`));
|
|
||||||
}, AUTOSCROLL_TIMEOUT * 1000);
|
|
||||||
});
|
|
||||||
|
|
||||||
const scrollingPromise = new Promise<void>((resolve) => {
|
|
||||||
let totalHeight = 0;
|
|
||||||
let distance = 100;
|
|
||||||
let scrollDown = setInterval(() => {
|
|
||||||
let scrollHeight = document.body.scrollHeight;
|
|
||||||
window.scrollBy(0, distance);
|
|
||||||
totalHeight += distance;
|
|
||||||
if (totalHeight >= scrollHeight) {
|
|
||||||
clearInterval(scrollDown);
|
|
||||||
window.scroll(0, 0);
|
|
||||||
resolve();
|
|
||||||
}
|
|
||||||
}, 100);
|
|
||||||
});
|
|
||||||
|
|
||||||
await Promise.race([scrollingPromise, timeoutPromise]);
|
|
||||||
};
|
|
|
@ -4,7 +4,8 @@ export default async function validateUrlSize(url: string) {
|
||||||
|
|
||||||
const totalSizeMB =
|
const totalSizeMB =
|
||||||
Number(response.headers.get("content-length")) / Math.pow(1024, 2);
|
Number(response.headers.get("content-length")) / Math.pow(1024, 2);
|
||||||
if (totalSizeMB > 50) return null;
|
if (totalSizeMB > (Number(process.env.NEXT_PUBLIC_MAX_FILE_SIZE) || 30))
|
||||||
|
return null;
|
||||||
else return response.headers;
|
else return response.headers;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.log(err);
|
console.log(err);
|
||||||
|
|
|
@ -1,11 +1,9 @@
|
||||||
import { Link } from "@prisma/client";
|
|
||||||
|
|
||||||
/**
 * Tells whether a link has a stored screenshot that can be served.
 *
 * `screenshotPath` doubles as a status field: besides a real file path it can
 * hold the placeholders "pending" and "unavailable", which do not count as
 * available. An empty or missing path, or a missing link, is unavailable too.
 *
 * @param link The link record to inspect (may be null/undefined).
 * @returns `true` only when `screenshotPath` holds an actual path.
 */
export function screenshotAvailable(link: any) {
  return Boolean(
    link &&
      link.screenshotPath &&
      link.screenshotPath !== "pending" &&
      link.screenshotPath !== "unavailable"
  );
}
|
||||||
|
|
||||||
|
@ -14,7 +12,7 @@ export function pdfAvailable(link: any) {
|
||||||
link &&
|
link &&
|
||||||
link.pdfPath &&
|
link.pdfPath &&
|
||||||
link.pdfPath !== "pending" &&
|
link.pdfPath !== "pending" &&
|
||||||
link.pdfPath !== "failed"
|
link.pdfPath !== "unavailable"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,6 +21,6 @@ export function readabilityAvailable(link: any) {
|
||||||
link &&
|
link &&
|
||||||
link.readabilityPath &&
|
link.readabilityPath &&
|
||||||
link.readabilityPath !== "pending" &&
|
link.readabilityPath !== "pending" &&
|
||||||
link.readabilityPath !== "failed"
|
link.readabilityPath !== "unavailable"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,7 +81,7 @@ export default async function Index(req: NextApiRequest, res: NextApiResponse) {
|
||||||
|
|
||||||
// // await uploadHandler(linkId, )
|
// // await uploadHandler(linkId, )
|
||||||
|
|
||||||
// const MAX_UPLOAD_SIZE = Number(process.env.NEXT_PUBLIC_MAX_UPLOAD_SIZE);
|
// const MAX_UPLOAD_SIZE = Number(process.env.NEXT_PUBLIC_MAX_FILE_SIZE);
|
||||||
|
|
||||||
// const form = formidable({
|
// const form = formidable({
|
||||||
// maxFields: 1,
|
// maxFields: 1,
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import type { NextApiRequest, NextApiResponse } from "next";
|
import type { NextApiRequest, NextApiResponse } from "next";
|
||||||
import urlHandler from "@/lib/api/urlHandler";
|
import urlHandler from "@/lib/api/archiveHandler";
|
||||||
import { prisma } from "@/lib/api/db";
|
import { prisma } from "@/lib/api/db";
|
||||||
import verifyUser from "@/lib/api/verifyUser";
|
import verifyUser from "@/lib/api/verifyUser";
|
||||||
import isValidUrl from "@/lib/shared/isValidUrl";
|
import isValidUrl from "@/lib/shared/isValidUrl";
|
||||||
|
|
|
@ -1,176 +0,0 @@
|
||||||
import { chromium, devices } from "playwright";
|
|
||||||
import { prisma } from "../../lib/api/db";
|
|
||||||
import createFile from "../../lib/api/storage/createFile";
|
|
||||||
import sendToWayback from "../../lib/api/sendToWayback";
|
|
||||||
import { Readability } from "@mozilla/readability";
|
|
||||||
import { JSDOM } from "jsdom";
|
|
||||||
import DOMPurify from "dompurify";
|
|
||||||
import { Collection, Link, User } from "@prisma/client";
|
|
||||||
|
|
||||||
type LinksAndCollectionAndOwner = Link & {
|
|
||||||
collection: Collection & {
|
|
||||||
owner: User;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
export default async function urlHandler(link: LinksAndCollectionAndOwner) {
|
|
||||||
const user = link.collection?.owner;
|
|
||||||
|
|
||||||
const targetLink = await prisma.link.update({
|
|
||||||
where: { id: link.id },
|
|
||||||
data: {
|
|
||||||
screenshotPath: user.archiveAsScreenshot ? "pending" : null,
|
|
||||||
pdfPath: user.archiveAsPDF ? "pending" : null,
|
|
||||||
readabilityPath: "pending",
|
|
||||||
lastPreserved: new Date().toISOString(),
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
// archive.org
|
|
||||||
|
|
||||||
if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
|
|
||||||
|
|
||||||
if (user.archiveAsPDF || user.archiveAsScreenshot) {
|
|
||||||
const browser = await chromium.launch();
|
|
||||||
const context = await browser.newContext(devices["Desktop Chrome"]);
|
|
||||||
const page = await context.newPage();
|
|
||||||
|
|
||||||
try {
|
|
||||||
link.url &&
|
|
||||||
(await page.goto(link.url, { waitUntil: "domcontentloaded" }));
|
|
||||||
|
|
||||||
const content = await page.content();
|
|
||||||
|
|
||||||
// TODO
|
|
||||||
// const session = await page.context().newCDPSession(page);
|
|
||||||
// const doc = await session.send("Page.captureSnapshot", {
|
|
||||||
// format: "mhtml",
|
|
||||||
// });
|
|
||||||
// const saveDocLocally = (doc: any) => {
|
|
||||||
// console.log(doc);
|
|
||||||
// return createFile({
|
|
||||||
// data: doc,
|
|
||||||
// filePath: `archives/${targetLink.collectionId}/${link.id}.mhtml`,
|
|
||||||
// });
|
|
||||||
// };
|
|
||||||
// saveDocLocally(doc.data);
|
|
||||||
|
|
||||||
// Readability
|
|
||||||
|
|
||||||
const window = new JSDOM("").window;
|
|
||||||
const purify = DOMPurify(window);
|
|
||||||
const cleanedUpContent = purify.sanitize(content);
|
|
||||||
const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
|
|
||||||
const article = new Readability(dom.window.document).parse();
|
|
||||||
|
|
||||||
const articleText = article?.textContent
|
|
||||||
.replace(/ +(?= )/g, "") // strip out multiple spaces
|
|
||||||
.replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks
|
|
||||||
|
|
||||||
await createFile({
|
|
||||||
data: JSON.stringify(article),
|
|
||||||
filePath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
|
|
||||||
});
|
|
||||||
|
|
||||||
await prisma.link.update({
|
|
||||||
where: { id: link.id },
|
|
||||||
data: {
|
|
||||||
readabilityPath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
|
|
||||||
textContent: articleText,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
// Screenshot/PDF
|
|
||||||
|
|
||||||
let faulty = false;
|
|
||||||
await page
|
|
||||||
.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30)
|
|
||||||
.catch((e) => (faulty = true));
|
|
||||||
|
|
||||||
const linkExists = await prisma.link.findUnique({
|
|
||||||
where: { id: link.id },
|
|
||||||
});
|
|
||||||
|
|
||||||
if (linkExists && !faulty) {
|
|
||||||
const processingPromises = [];
|
|
||||||
|
|
||||||
if (user.archiveAsScreenshot) {
|
|
||||||
const screenshot = await page.screenshot({ fullPage: true });
|
|
||||||
processingPromises.push(
|
|
||||||
createFile({
|
|
||||||
data: screenshot,
|
|
||||||
filePath: `archives/${linkExists.collectionId}/${link.id}.png`,
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (user.archiveAsPDF) {
|
|
||||||
const pdf = await page.pdf({
|
|
||||||
width: "1366px",
|
|
||||||
height: "1931px",
|
|
||||||
printBackground: true,
|
|
||||||
margin: { top: "15px", bottom: "15px" },
|
|
||||||
});
|
|
||||||
processingPromises.push(
|
|
||||||
createFile({
|
|
||||||
data: pdf,
|
|
||||||
filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`,
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
await Promise.allSettled(processingPromises);
|
|
||||||
|
|
||||||
await prisma.link.update({
|
|
||||||
where: { id: link.id },
|
|
||||||
data: {
|
|
||||||
screenshotPath: user.archiveAsScreenshot
|
|
||||||
? `archives/${linkExists.collectionId}/${link.id}.png`
|
|
||||||
: null,
|
|
||||||
pdfPath: user.archiveAsPDF
|
|
||||||
? `archives/${linkExists.collectionId}/${link.id}.pdf`
|
|
||||||
: null,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
} else if (faulty) {
|
|
||||||
await prisma.link.update({
|
|
||||||
where: { id: link.id },
|
|
||||||
data: {
|
|
||||||
screenshotPath: null,
|
|
||||||
pdfPath: null,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} catch (err) {
|
|
||||||
console.log(err);
|
|
||||||
throw err;
|
|
||||||
} finally {
|
|
||||||
await browser.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
|
|
||||||
const timeoutPromise = new Promise<void>((_, reject) => {
|
|
||||||
setTimeout(() => {
|
|
||||||
reject(new Error(`Webpage was too long to be archived.`));
|
|
||||||
}, AUTOSCROLL_TIMEOUT * 1000);
|
|
||||||
});
|
|
||||||
|
|
||||||
const scrollingPromise = new Promise<void>((resolve) => {
|
|
||||||
let totalHeight = 0;
|
|
||||||
let distance = 100;
|
|
||||||
let scrollDown = setInterval(() => {
|
|
||||||
let scrollHeight = document.body.scrollHeight;
|
|
||||||
window.scrollBy(0, distance);
|
|
||||||
totalHeight += distance;
|
|
||||||
if (totalHeight >= scrollHeight) {
|
|
||||||
clearInterval(scrollDown);
|
|
||||||
window.scroll(0, 0);
|
|
||||||
resolve();
|
|
||||||
}
|
|
||||||
}, 100);
|
|
||||||
});
|
|
||||||
|
|
||||||
await Promise.race([scrollingPromise, timeoutPromise]);
|
|
||||||
};
|
|
|
@ -1,11 +1,9 @@
|
||||||
import { Collection, Link, User } from "@prisma/client";
|
import { Collection, Link, User } from "@prisma/client";
|
||||||
import { prisma } from "../lib/api/db";
|
import { prisma } from "../lib/api/db";
|
||||||
import urlHandler from "./lib/urlHandler";
|
import archiveHandler from "../lib/api/archiveHandler";
|
||||||
|
|
||||||
const args = process.argv.slice(2).join(" ");
|
const args = process.argv.slice(2).join(" ");
|
||||||
|
|
||||||
console.log(process.env.NEXTAUTH_URL);
|
|
||||||
|
|
||||||
const archiveTakeCount = Number(process.env.ARCHIVE_TAKE_COUNT || "") || 5;
|
const archiveTakeCount = Number(process.env.ARCHIVE_TAKE_COUNT || "") || 5;
|
||||||
|
|
||||||
type LinksAndCollectionAndOwner = Link & {
|
type LinksAndCollectionAndOwner = Link & {
|
||||||
|
@ -136,7 +134,7 @@ async function processBatch() {
|
||||||
`Processing link ${link.url} for user ${link.collection.ownerId}`
|
`Processing link ${link.url} for user ${link.collection.ownerId}`
|
||||||
);
|
);
|
||||||
|
|
||||||
await urlHandler(link);
|
await archiveHandler(link);
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
"\x1b[34m%s\x1b[0m",
|
"\x1b[34m%s\x1b[0m",
|
||||||
|
@ -152,9 +150,12 @@ async function processBatch() {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Process each link in the batch concurrently
|
// Process each link in the batch concurrently
|
||||||
const processingPromises = [...linksOldToNew, ...linksNewToOld].map((e) =>
|
const processingPromises = [...linksOldToNew, ...linksNewToOld]
|
||||||
archiveLink(e)
|
// Make sure we don't process the same link twice
|
||||||
);
|
.filter((value, index, self) => {
|
||||||
|
return self.findIndex((item) => item.id === value.id) === index;
|
||||||
|
})
|
||||||
|
.map((e) => archiveLink(e));
|
||||||
|
|
||||||
await Promise.allSettled(processingPromises);
|
await Promise.allSettled(processingPromises);
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,7 +9,7 @@ declare global {
|
||||||
STORAGE_FOLDER?: string;
|
STORAGE_FOLDER?: string;
|
||||||
AUTOSCROLL_TIMEOUT?: string;
|
AUTOSCROLL_TIMEOUT?: string;
|
||||||
RE_ARCHIVE_LIMIT?: string;
|
RE_ARCHIVE_LIMIT?: string;
|
||||||
NEXT_PUBLIC_MAX_UPLOAD_SIZE?: string;
|
NEXT_PUBLIC_MAX_FILE_SIZE?: string;
|
||||||
|
|
||||||
SPACES_KEY?: string;
|
SPACES_KEY?: string;
|
||||||
SPACES_SECRET?: string;
|
SPACES_SECRET?: string;
|
||||||
|
|
Ŝarĝante…
Reference in New Issue