code improvements
This commit is contained in:
parent
797ddc4b73
commit
2b8f7d4be2
|
@ -18,6 +18,7 @@ import {
|
|||
import PreservedFormatRow from "@/components/PreserverdFormatRow";
|
||||
import useAccountStore from "@/store/account";
|
||||
import getPublicUserData from "@/lib/client/getPublicUserData";
|
||||
import { BeatLoader } from "react-spinners";
|
||||
|
||||
type Props = {
|
||||
onClose: Function;
|
||||
|
@ -87,6 +88,15 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
|
|||
);
|
||||
};
|
||||
|
||||
// Reports whether any preserved format (screenshot, PDF, readable view or
// SingleFile copy) can already be shown for the current link. The checks run
// in that order and stop at the first one that yields a truthy value.
const atLeastOneFormatAvailable = () => {
  const hasScreenshot = screenshotAvailable(link);
  if (hasScreenshot) return hasScreenshot;

  const hasPdf = pdfAvailable(link);
  if (hasPdf) return hasPdf;

  const hasReadable = readabilityAvailable(link);
  if (hasReadable) return hasReadable;

  // Last candidate: its result is returned as-is, truthy or not.
  return singlefileAvailable(link);
};
|
||||
|
||||
useEffect(() => {
|
||||
(async () => {
|
||||
const data = await getLink(link.id as number, isPublic);
|
||||
|
@ -143,11 +153,10 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
|
|||
|
||||
<div className="divider mb-2 mt-1"></div>
|
||||
|
||||
{isReady() &&
|
||||
(screenshotAvailable(link) ||
|
||||
{screenshotAvailable(link) ||
|
||||
pdfAvailable(link) ||
|
||||
readabilityAvailable(link) ||
|
||||
singlefileAvailable(link)) ? (
|
||||
singlefileAvailable(link) ? (
|
||||
<p className="mb-3">
|
||||
The following formats are available for this link:
|
||||
</p>
|
||||
|
@ -156,8 +165,6 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
|
|||
)}
|
||||
|
||||
<div className={`flex flex-col gap-3`}>
|
||||
{isReady() ? (
|
||||
<>
|
||||
{screenshotAvailable(link) ? (
|
||||
<PreservedFormatRow
|
||||
name={"Screenshot"}
|
||||
|
@ -182,6 +189,16 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
|
|||
/>
|
||||
) : undefined}
|
||||
|
||||
{singlefileAvailable(link) ? (
|
||||
<PreservedFormatRow
|
||||
name={"SingleFile (Full Copy)"}
|
||||
icon={"bi-filetype-html"}
|
||||
format={ArchivedFormat.singlefile}
|
||||
activeLink={link}
|
||||
downloadable={true}
|
||||
/>
|
||||
) : undefined}
|
||||
|
||||
{readabilityAvailable(link) ? (
|
||||
<PreservedFormatRow
|
||||
name={"Readable"}
|
||||
|
@ -191,21 +208,14 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
|
|||
/>
|
||||
) : undefined}
|
||||
|
||||
{singlefileAvailable(link) ? (
|
||||
<PreservedFormatRow
|
||||
name={"Singlefile"}
|
||||
icon={"bi-filetype-html"}
|
||||
format={ArchivedFormat.singlefile}
|
||||
activeLink={link}
|
||||
downloadable={true}
|
||||
{!isReady() && !atLeastOneFormatAvailable() ? (
|
||||
<div className={`w-full h-full flex flex-col justify-center p-10`}>
|
||||
<BeatLoader
|
||||
color="oklch(var(--p))"
|
||||
className="mx-auto mb-3"
|
||||
size={30}
|
||||
/>
|
||||
) : undefined}
|
||||
</>
|
||||
) : (
|
||||
<div
|
||||
className={`w-full h-full flex flex-col justify-center p-10 skeleton bg-base-200`}
|
||||
>
|
||||
<i className="bi-stack drop-shadow text-primary text-8xl mx-auto mb-5"></i>
|
||||
|
||||
<p className="text-center text-2xl">
|
||||
Link preservation is in the queue
|
||||
</p>
|
||||
|
@ -213,7 +223,22 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
|
|||
Please check back later to see the result
|
||||
</p>
|
||||
</div>
|
||||
)}
|
||||
) : !isReady() && atLeastOneFormatAvailable() ? (
|
||||
<div className={`w-full h-full flex flex-col justify-center p-5`}>
|
||||
<BeatLoader
|
||||
color="oklch(var(--p))"
|
||||
className="mx-auto mb-3"
|
||||
size={20}
|
||||
/>
|
||||
|
||||
<p className="text-center">
|
||||
There are more preserved formats in the queue
|
||||
</p>
|
||||
<p className="text-center text-sm">
|
||||
Please check back later to see the result
|
||||
</p>
|
||||
</div>
|
||||
) : undefined}
|
||||
|
||||
<div
|
||||
className={`flex flex-col sm:flex-row gap-3 items-center justify-center ${
|
||||
|
|
|
@ -1,18 +1,14 @@
|
|||
import { LaunchOptions, chromium, devices } from "playwright";
|
||||
import { prisma } from "./db";
|
||||
import createFile from "./storage/createFile";
|
||||
import sendToWayback from "./sendToWayback";
|
||||
import { Readability } from "@mozilla/readability";
|
||||
import { JSDOM } from "jsdom";
|
||||
import DOMPurify from "dompurify";
|
||||
import sendToWayback from "./preservationScheme/sendToWayback";
|
||||
import { Collection, Link, User } from "@prisma/client";
|
||||
import validateUrlSize from "./validateUrlSize";
|
||||
import removeFile from "./storage/removeFile";
|
||||
import Jimp from "jimp";
|
||||
import { execSync } from "child_process";
|
||||
import axios from "axios";
|
||||
import { Agent } from "http";
|
||||
import createFolder from "./storage/createFolder";
|
||||
import archiveAsSinglefile from "./preservationScheme/archiveAsSinglefile";
|
||||
import archiveAsReadability from "./preservationScheme/archiveAsReadablility";
|
||||
|
||||
type LinksAndCollectionAndOwner = Link & {
|
||||
collection: Collection & {
|
||||
|
@ -23,25 +19,6 @@ type LinksAndCollectionAndOwner = Link & {
|
|||
const BROWSER_TIMEOUT = Number(process.env.BROWSER_TIMEOUT) || 5;
|
||||
|
||||
export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
||||
// allow user to configure a proxy
|
||||
let browserOptions: LaunchOptions = {};
|
||||
if (process.env.PROXY) {
|
||||
browserOptions.proxy = {
|
||||
server: process.env.PROXY,
|
||||
bypass: process.env.PROXY_BYPASS,
|
||||
username: process.env.PROXY_USERNAME,
|
||||
password: process.env.PROXY_PASSWORD,
|
||||
};
|
||||
}
|
||||
|
||||
const browser = await chromium.launch(browserOptions);
|
||||
const context = await browser.newContext({
|
||||
...devices["Desktop Chrome"],
|
||||
ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true",
|
||||
});
|
||||
|
||||
const page = await context.newPage();
|
||||
|
||||
const timeoutPromise = new Promise((_, reject) => {
|
||||
setTimeout(
|
||||
() =>
|
||||
|
@ -54,9 +31,24 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
|||
);
|
||||
});
|
||||
|
||||
// allow user to configure a proxy
|
||||
let browserOptions: LaunchOptions = {};
|
||||
if (process.env.PROXY) {
|
||||
browserOptions.proxy = {
|
||||
server: process.env.PROXY,
|
||||
bypass: process.env.PROXY_BYPASS,
|
||||
username: process.env.PROXY_USERNAME,
|
||||
password: process.env.PROXY_PASSWORD,
|
||||
};
|
||||
}
|
||||
|
||||
const browser = await chromium.launch(browserOptions);
|
||||
|
||||
try {
|
||||
await Promise.race([
|
||||
(async () => {
|
||||
const user = link.collection?.owner;
|
||||
|
||||
const validatedUrl = link.url
|
||||
? await validateUrlSize(link.url)
|
||||
: undefined;
|
||||
|
@ -76,12 +68,7 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
|||
else if (contentType.includes("image/png")) imageExtension = "png";
|
||||
}
|
||||
|
||||
const user = link.collection?.owner;
|
||||
|
||||
// send to archive.org
|
||||
if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
|
||||
|
||||
const targetLink = await prisma.link.update({
|
||||
await prisma.link.update({
|
||||
where: { id: link.id },
|
||||
data: {
|
||||
type: linkType,
|
||||
|
@ -106,6 +93,18 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
|||
},
|
||||
});
|
||||
|
||||
// SingleFile
|
||||
if (
|
||||
!link.singlefile?.startsWith("archive") &&
|
||||
!link.singlefile?.startsWith("unavailable") &&
|
||||
user.archiveAsSinglefile &&
|
||||
link.url
|
||||
)
|
||||
await archiveAsSinglefile(link);
|
||||
|
||||
// send to archive.org
|
||||
if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
|
||||
|
||||
if (linkType === "image" && !link.image?.startsWith("archive")) {
|
||||
await imageHandler(link, imageExtension); // archive image (jpeg/png)
|
||||
return;
|
||||
|
@ -115,100 +114,34 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
|||
} else if (link.url) {
|
||||
// archive url
|
||||
|
||||
const context = await browser.newContext({
|
||||
...devices["Desktop Chrome"],
|
||||
ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true",
|
||||
});
|
||||
|
||||
const page = await context.newPage();
|
||||
|
||||
await page.goto(link.url, { waitUntil: "domcontentloaded" });
|
||||
|
||||
const content = await page.content();
|
||||
|
||||
// Singlefile
|
||||
if (
|
||||
user.archiveAsSinglefile &&
|
||||
!link.singlefile?.startsWith("archive")
|
||||
) {
|
||||
let command = process.env.SINGLEFILE_ARCHIVE_COMMAND;
|
||||
let httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API;
|
||||
if (command) {
|
||||
if (command.includes("{{URL}}")) {
|
||||
try {
|
||||
let html = execSync(command.replace("{{URL}}", link.url), {
|
||||
timeout: 60000,
|
||||
maxBuffer: 1024 * 1024 * 100,
|
||||
});
|
||||
await createFile({
|
||||
data: html,
|
||||
filePath: `archives/${targetLink.collectionId}/${link.id}.html`,
|
||||
});
|
||||
} catch (err) {
|
||||
console.error(
|
||||
"Error running SINGLEFILE_ARCHIVE_COMMAND:",
|
||||
err
|
||||
);
|
||||
}
|
||||
} else {
|
||||
console.error(
|
||||
"Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}"
|
||||
);
|
||||
}
|
||||
} else if (httpApi) {
|
||||
try {
|
||||
let html = await axios.post(
|
||||
httpApi,
|
||||
{ url: link.url },
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/x-www-form-urlencoded",
|
||||
},
|
||||
httpAgent: new Agent({ keepAlive: false }),
|
||||
}
|
||||
);
|
||||
await createFile({
|
||||
data: html.data,
|
||||
filePath: `archives/${targetLink.collectionId}/${link.id}.html`,
|
||||
});
|
||||
} catch (err) {
|
||||
console.error(
|
||||
"Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:",
|
||||
err
|
||||
);
|
||||
}
|
||||
} else {
|
||||
console.error(
|
||||
"No SINGLEFILE_ARCHIVE_COMMAND or SINGLEFILE_ARCHIVE_HTTP_API defined."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Readability
|
||||
const window = new JSDOM("").window;
|
||||
const purify = DOMPurify(window);
|
||||
const cleanedUpContent = purify.sanitize(content);
|
||||
const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
|
||||
const article = new Readability(dom.window.document).parse();
|
||||
const articleText = article?.textContent
|
||||
.replace(/ +(?= )/g, "") // strip out multiple spaces
|
||||
.replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks
|
||||
if (
|
||||
articleText &&
|
||||
articleText !== "" &&
|
||||
!link.readable?.startsWith("archive")
|
||||
) {
|
||||
await createFile({
|
||||
data: JSON.stringify(article),
|
||||
filePath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
|
||||
});
|
||||
|
||||
await prisma.link.update({
|
||||
where: { id: link.id },
|
||||
data: {
|
||||
readable: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
|
||||
textContent: articleText,
|
||||
},
|
||||
});
|
||||
}
|
||||
!link.readable?.startsWith("archives") &&
|
||||
!link.readable?.startsWith("unavailable")
|
||||
)
|
||||
await archiveAsReadability(content, link);
|
||||
|
||||
// Preview
|
||||
|
||||
if (
|
||||
!link.preview?.startsWith("archives") &&
|
||||
!link.preview?.startsWith("unavailable")
|
||||
) {
|
||||
const ogImageUrl = await page.evaluate(() => {
|
||||
const metaTag = document.querySelector('meta[property="og:image"]');
|
||||
const metaTag = document.querySelector(
|
||||
'meta[property="og:image"]'
|
||||
);
|
||||
return metaTag ? (metaTag as any).content : null;
|
||||
});
|
||||
|
||||
|
@ -276,7 +209,14 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
|||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (
|
||||
(!link.image?.startsWith("archives") &&
|
||||
!link.image?.startsWith("unavailable")) ||
|
||||
(!link.pdf?.startsWith("archives") &&
|
||||
!link.pdf?.startsWith("unavailable"))
|
||||
) {
|
||||
// Screenshot/PDF
|
||||
await page.evaluate(
|
||||
autoScroll,
|
||||
|
@ -341,6 +281,7 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
|||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
})(),
|
||||
timeoutPromise,
|
||||
]);
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
import { Readability } from "@mozilla/readability";
|
||||
import { JSDOM } from "jsdom";
|
||||
import DOMPurify from "dompurify";
|
||||
import { prisma } from "../db";
|
||||
import createFile from "../storage/createFile";
|
||||
import { Link } from "@prisma/client";
|
||||
|
||||
/**
 * Builds a reader-mode copy of a page and stores it for the given link.
 *
 * The HTML is sanitized with DOMPurify, parsed with Mozilla Readability, and
 * the resulting article is written to
 * `archives/<collectionId>/<linkId>_readability.json`. The link row is then
 * updated with that path and with the article's plain text.
 *
 * Nothing is written when the link already has a readable archive, when
 * Readability yields no text, or when the link's collection cannot be found.
 *
 * @param content Raw HTML of the page to process.
 * @param link    The link the content belongs to.
 */
const archiveAsReadablility = async (content: string, link: Link) => {
  // Skip the sanitize/parse work entirely when a readable copy already exists.
  if (link.readable?.startsWith("archive")) return;

  const window = new JSDOM("").window;
  const purify = DOMPurify(window);
  const cleanedUpContent = purify.sanitize(content);

  // Only hand jsdom a document URL when there is one; an empty string is not
  // a valid URL, whereas omitting the option falls back to jsdom's default.
  const dom = new JSDOM(cleanedUpContent, link.url ? { url: link.url } : {});
  const article = new Readability(dom.window.document).parse();

  const articleText = article?.textContent
    .replace(/ +(?= )/g, "") // strip out multiple spaces
    .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks

  // Covers a failed parse (undefined) as well as an empty article ("").
  if (!articleText) return;

  const collectionId = (
    await prisma.link.findUnique({
      where: { id: link.id },
      select: { collectionId: true },
    })
  )?.collectionId;

  // Without this guard the file would land in "archives/undefined/..." and
  // that bogus path would be persisted on the link.
  if (!collectionId) {
    console.error("Error archiving as Readability: Collection ID not found");
    return;
  }

  const filePath = `archives/${collectionId}/${link.id}_readability.json`;

  await createFile({
    data: JSON.stringify(article),
    filePath,
  });

  await prisma.link.update({
    where: { id: link.id },
    data: {
      readable: filePath,
      textContent: articleText,
    },
  });
};
|
||||
|
||||
export default archiveAsReadablility;
|
|
@ -0,0 +1,111 @@
|
|||
import { execSync } from "child_process";
|
||||
import createFile from "../storage/createFile";
|
||||
import axios from "axios";
|
||||
import { Agent } from "http";
|
||||
import { prisma } from "../db";
|
||||
import { Link } from "@prisma/client";
|
||||
|
||||
/**
 * Archives a link as a single self-contained HTML file.
 *
 * The HTML is produced by one of two user-configured backends, checked in
 * this order:
 *  1. `SINGLEFILE_ARCHIVE_COMMAND`: a shell command template containing the
 *     `{{URL}}` placeholder, whose stdout is taken as the HTML.
 *  2. `SINGLEFILE_ARCHIVE_HTTP_API`: an HTTP endpoint that receives the URL
 *     in a POST request and responds with the HTML.
 *
 * On success the file is written to `archives/<collectionId>/<linkId>.html`
 * and the link's `singlefile` field is set to that path. Failures are logged
 * and swallowed so that the caller can carry on with other formats. When
 * neither backend is configured, or the link has no URL, nothing happens.
 *
 * @param link The link to archive.
 */
const archiveAsSinglefile = async (link: Link) => {
  if (!link.url) return;
  const url = link.url;

  const command = process.env.SINGLEFILE_ARCHIVE_COMMAND;
  const httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API;

  // Writes the HTML to storage and records its path on the link.
  // `errorPrefix` names the backend so log lines point at the right setting.
  const saveHtml = async (html: Buffer | string, errorPrefix: string) => {
    const collectionId = (
      await prisma.link.findUnique({
        where: { id: link.id },
        select: { collectionId: true },
      })
    )?.collectionId;

    if (!collectionId) {
      console.error(`${errorPrefix} Collection ID not found`);
      return;
    }

    const filePath = `archives/${collectionId}/${link.id}.html`;

    await createFile({ data: html, filePath });

    // Only reached when the file was written without throwing.
    await prisma.link.update({
      where: { id: link.id },
      data: { singlefile: filePath },
    });
  };

  if (command) {
    if (!command.includes("{{URL}}")) {
      console.error("Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}");
      return;
    }

    try {
      // NOTE(security): the URL is user-supplied and ends up in a shell
      // command line. Quoting is left to the configured template because it
      // may already wrap {{URL}} in quotes - review before exposing this to
      // untrusted users.
      //
      // A replacer function is used so that "$&", "$$", "$`" or "$'" inside
      // the URL are inserted literally instead of being treated as
      // replacement patterns.
      const html = execSync(
        command.replace("{{URL}}", () => url),
        {
          timeout: 120000,
          maxBuffer: 1024 * 1024 * 30,
        }
      );

      if (!html.length) {
        console.error("Error running SINGLEFILE_ARCHIVE_COMMAND: Empty buffer");
        return;
      }

      await saveHtml(html, "Error running SINGLEFILE_ARCHIVE_COMMAND:");
    } catch (err) {
      console.error("Error running SINGLEFILE_ARCHIVE_COMMAND:", err);
    }
  } else if (httpApi) {
    try {
      const response = await axios.post(
        httpApi,
        { url },
        {
          headers: {
            "Content-Type": "application/x-www-form-urlencoded",
          },
          httpAgent: new Agent({ keepAlive: false }),
        }
      );

      // Optional chaining so a missing body is reported as empty rather than
      // surfacing as an unrelated TypeError.
      if (!response.data?.length) {
        console.error(
          "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API: Empty response"
        );
        return;
      }

      await saveHtml(
        response.data,
        "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:"
      );
    } catch (err) {
      console.error(
        "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:",
        err
      );
    }
  }
};
|
||||
|
||||
export default archiveAsSinglefile;
|
Ŝarĝante…
Reference in New Issue