163 lines
6.1 KiB
Common Lisp
163 lines
6.1 KiB
Common Lisp
;; Copyright © 2024 Jaidyn Ann <jadedctrl@posteo.at>
|
||
;;
|
||
;; This program is free software: you can redistribute it and/or
|
||
;; modify it under the terms of the GNU General Public License as
|
||
;; published by the Free Software Foundation, either version 3 of
|
||
;; the License, or (at your option) any later version.
|
||
;;
|
||
;; This program is distributed in the hope that it will be useful,
|
||
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
;; GNU General Public License for more details.
|
||
;;
|
||
;; You should have received a copy of the GNU General Public License
|
||
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||
|
||
(defpackage :mirror-img
|
||
(:use :cl)
|
||
(:export :mirror-img))
|
||
|
||
(in-package :mirror-img)
|
||
|
||
(defvar *src-tags* '("audio" "embed" "img" "input" "script" "source" "track" "video"))
|
||
(defvar *href-tags* '("link"))
|
||
|
||
|
||
;;; Mirror-img
|
||
;;; ————————————————————————————————————————
|
||
(defun mirror-img (html-file base-dir &optional (tags '("img")))
|
||
"Attempt to mirror all remote HREF/SRC URLs of an HTML file’s tags,
|
||
downloading them to base-dir. The remote URLs will be replaced with the
|
||
local version, where downloading was successful.
|
||
Returns a string of the modified page’s HTML."
|
||
(let* ((dom (lquery:$ (lquery:initialize html-file)))
|
||
(url→pathname-alist
|
||
(mirror-linked-urls dom (append *src-tags* *href-tags*) base-dir))
|
||
(url→relative-url-alist
|
||
(mapcar (lambda (pair) (cons (car pair) (namestring (cdr pair))))
|
||
url→pathname-alist)))
|
||
(substitute-urls dom
|
||
url→relative-url-alist
|
||
(append *src-tags* *href-tags*))
|
||
(aref (lquery:$ dom (serialize)) 0)))
|
||
|
||
|
||
(defun mirror-linked-urls (dom &optional (tags '("link" "img" "script")) base-dir)
|
||
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
|
||
DOM, downloading them to either the current-working directory or the base-dir.
|
||
Returns an associative list containing pairs of remote-URLs to local pathnames.
|
||
Any unsuccessfuly-mirrored URL is excluded from the alist.
|
||
For example:
|
||
((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))"
|
||
(remove-if
|
||
#'not
|
||
(mapcar (lambda (url)
|
||
(let ((download-path
|
||
(ensure-directories-exist
|
||
(mirrored-pathname url base-dir))))
|
||
(if (http-fetch url download-path)
|
||
(cons url download-path))))
|
||
(linked-urls dom tags))))
|
||
|
||
|
||
(defun substitute-urls (dom substitution-alist &optional (tags '("img")))
|
||
"Replace SRC or HREF attributes of certain tags in an LQuery DOM, based on a
|
||
substitution associative list. The substitution alist is made up of cons-cells
|
||
mapping replacee URLs to substition URLs.
|
||
For instance, with the alist cons-cell (“https://xwx.moe/b.jpg” . “a/b.jpg”),
|
||
any HREF/SRC values of “https://xwx.moe/b.jpg” will be replaced with “a/b.jpg”,
|
||
among the given tags."
|
||
(mapcar
|
||
(lambda (tag)
|
||
(lquery:$ dom tag
|
||
#'(lambda (nodes)
|
||
(loop for node across nodes do
|
||
(let ((substitution
|
||
(cdr (assoc (node-url node)
|
||
substitution-alist :test #'equal))))
|
||
(if substitution
|
||
(set-node-url node substitution)))))))
|
||
tags)
|
||
dom)
|
||
|
||
|
||
(defun mirrored-pathname (url &optional base-dir)
|
||
"Given a URL, return the corresponding path we would download it to, were we
|
||
to mirror it."
|
||
(let* ((url (quri:uri url))
|
||
(path (quri:uri-path url)))
|
||
(pathname
|
||
(format nil "~@[~A/~]~A/~A~@[.~A~]"
|
||
base-dir
|
||
(quri:uri-host url)
|
||
(pathname-name path)
|
||
(pathname-type path)))))
|
||
|
||
|
||
|
||
;;; DOM-parsing
|
||
;;; ————————————————————————————————————————
|
||
(defun linked-urls (dom &optional (tags '("link" "img" "script")))
|
||
"Return a list of all URLs in the LQuery DOM contained in the given tags’
|
||
HREF and SRC attributes."
|
||
(flet ((url-list-of-tag (tag)
|
||
(remove-if
|
||
#'not
|
||
(append
|
||
(coerce (lquery:$ dom tag (attr "src")) 'list)
|
||
(coerce (lquery:$ dom tag (attr "href")) 'list)))))
|
||
(loop for tag in tags
|
||
append (url-list-of-tag tag))))
|
||
|
||
|
||
(defun node-url (node)
|
||
"Return the SRC or HREF attribute of an LQuery node, if such a thing exists."
|
||
(and node
|
||
(or (lquery-funcs:attr node "src")
|
||
(lquery-funcs:attr node "href"))))
|
||
|
||
|
||
(defun set-node-url (node url)
|
||
"Set the SRC or HREF attribute of an LQuery node (based on which attribute is
|
||
already set)."
|
||
(and node
|
||
(if (lquery-funcs:attr node "src")
|
||
(lquery-funcs:attr node "src" url))
|
||
(if (lquery-funcs:attr node "href")
|
||
(lquery-funcs:attr node "href" url))))
|
||
|
||
|
||
|
||
;;; Util
|
||
;;; ————————————————————————————————————————
|
||
(defun http-fetch (url path)
|
||
"Download a URL to a path; if successful, returns the pathname. Otherwise, NIL."
|
||
(handler-case
|
||
(and (dexador:fetch (url-encode-uri (quri:uri url)) path)
|
||
(pathname path))
|
||
(file-exists ()
|
||
path)
|
||
(error nil)))
|
||
|
||
|
||
(defun url-encode-uri (uri)
|
||
"URL-encode the path component of a URI. For example,
|
||
“https://invalid.tld/dad alive.jpg” → “https://invalid.tld/dad%20alive.jpg”"
|
||
(let ((uri (quri:uri uri)))
|
||
(format nil "~@[~A://~]~A~@[:~A~]~A"
|
||
(quri:uri-scheme uri)
|
||
(quri:uri-host uri)
|
||
(quri:uri-port uri)
|
||
(url-encode-path (quri:uri-path uri)))))
|
||
|
||
|
||
(defun url-encode-path (path)
|
||
"URL-encode a pathname’s components (directory and filenames). For example,
|
||
“/images!/dad alive.jpg” → “/images%21/dad%20alive.jpg”"
|
||
(let* ((split (split-sequence:split-sequence #\/ path))
|
||
(path-parts (if (not (car split)) '("") split)))
|
||
(reduce
|
||
(lambda (a b)
|
||
(format nil "~A/~A" a b))
|
||
(mapcar #'quri:url-encode path-parts))))
|