;; Copyright © 2024 Jaidyn Ann ;; ;; This program is free software: you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as ;; published by the Free Software Foundation, either version 3 of ;; the License, or (at your option) any later version. ;; ;; This program is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with this program. If not, see . (defpackage :mirror-img (:use :cl) (:export :mirror-img)) (in-package :mirror-img) (defvar *src-tags* '("audio" "embed" "img" "input" "script" "source" "track" "video")) (defvar *href-tags* '("link")) ;;; Mirror-img ;;; ———————————————————————————————————————— ;; pathname pathname list → string (defun mirror-img (html-file base-dir &optional (tags '("img"))) "Attempt to mirror all remote HREF/SRC URLs of an HTML file’s tags, downloading them to base-dir. The remote URLs will be replaced with the local version, where downloading was successful. Returns a string of the modified page’s HTML." (let* ((dom (lquery:$ (lquery:initialize html-file))) (url→pathname-alist (mirror-linked-urls dom (append *src-tags* *href-tags*) base-dir)) (url→relative-url-alist (mapcar (lambda (pair) (cons (car pair) (namestring (cdr pair)))) url→pathname-alist))) (substitute-urls dom url→relative-url-alist (append *src-tags* *href-tags*)) (aref (lquery:$ dom (serialize)) 0))) ;; lquery-dom list pathname → alist (defun mirror-linked-urls (dom &optional (tags '("link" "img" "script")) base-dir) "Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery DOM, downloading them to either the current-working directory or the base-dir. Returns an associative list containing pairs of remote-URLs to local pathnames. Any unsuccessfuly-mirrored URL is excluded from the alist. For example: ((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))" (remove-if #'not (mapcar (lambda (url) (let ((download-path (ensure-directories-exist (mirrored-pathname url base-dir)))) (if (http-fetch url download-path) (cons url download-path)))) (linked-urls dom tags)))) ;; lquery-dom alist list → nil (defun substitute-urls (dom substitution-alist &optional (tags '("img"))) "Replace SRC or HREF attributes of certain tags in an LQuery DOM, based on a substitution associative list. The substitution alist is made up of cons-cells mapping replacee URLs to substition URLs. For instance, with the alist cons-cell (“https://xwx.moe/b.jpg” . “a/b.jpg”), any HREF/SRC values of “https://xwx.moe/b.jpg” will be replaced with “a/b.jpg”, among the given tags." (mapcar (lambda (tag) (lquery:$ dom tag #'(lambda (nodes) (loop for node across nodes do (let ((substitution (cdr (assoc (node-url node) substitution-alist :test #'equal)))) (if substitution (set-node-url node substitution))))))) tags) dom) ;; string pathname → pathname (defun mirrored-pathname (url &optional base-dir) "Given a URL, return the corresponding path we would download it to, were we to mirror it." (let* ((url (quri:uri url)) (path (quri:uri-path url))) (pathname (format nil "~@[~A/~]~A/~A~@[.~A~]" base-dir (quri:uri-host url) (pathname-name path) (pathname-type path))))) ;;; DOM-parsing ;;; ———————————————————————————————————————— ;; lquery-dom list → list (defun linked-urls (dom &optional (tags '("link" "img" "script"))) "Return a list of all URLs in the LQuery DOM contained in the given tags’ HREF and SRC attributes." (flet ((url-list-of-tag (tag) (remove-if #'not (append (coerce (lquery:$ dom tag (attr "src")) 'list) (coerce (lquery:$ dom tag (attr "href")) 'list))))) (loop for tag in tags append (url-list-of-tag tag)))) ;; lquery-node → string (defun node-url (node) "Return the SRC or HREF attribute of an LQuery node, if such a thing exists." (and node (or (lquery-funcs:attr node "src") (lquery-funcs:attr node "href")))) ;; lquery-node → nil (defun set-node-url (node url) "Set the SRC or HREF attribute of an LQuery node (based on which attribute is already set)." (and node (if (lquery-funcs:attr node "src") (lquery-funcs:attr node "src" url)) (if (lquery-funcs:attr node "href") (lquery-funcs:attr node "href" url)))) ;;; Util ;;; ———————————————————————————————————————— ;; string pathname → pathname or nil (defun http-fetch (url path) "Download a URL to a path; if successful, returns the pathname. Otherwise, NIL." (handler-case (and (dexador:fetch (url-encode-uri (quri:uri url)) path) (pathname path)) (file-exists () path) (error nil))) ;; string → string (defun url-encode-uri (uri) "URL-encode the path component of a URI. For example, “https://invalid.tld/dad alive.jpg” → “https://invalid.tld/dad%20alive.jpg”" (let ((uri (quri:uri uri))) (format nil "~@[~A://~]~A~@[:~A~]~A" (quri:uri-scheme uri) (quri:uri-host uri) (quri:uri-port uri) (url-encode-path (quri:uri-path uri))))) ;; string → string (defun url-encode-path (path) "URL-encode a pathname’s components (directory and filenames). For example, “/images!/dad alive.jpg” → “/images%21/dad%20alive.jpg”" (let* ((split (split-sequence:split-sequence #\/ path)) (path-parts (if (not (car split)) '("") split))) (reduce (lambda (a b) (format nil "~A/~A" a b)) (mapcar #'quri:url-encode path-parts))))