Init
This commit is contained in:
commit
8cbb3e338d
|
@ -0,0 +1,157 @@
|
||||||
|
;; Copyright © 2024 Jaidyn Ann <jadedctrl@posteo.at>
|
||||||
|
;;
|
||||||
|
;; This program is free software: you can redistribute it and/or
|
||||||
|
;; modify it under the terms of the GNU General Public License as
|
||||||
|
;; published by the Free Software Foundation, either version 3 of
|
||||||
|
;; the License, or (at your option) any later version.
|
||||||
|
;;
|
||||||
|
;; This program is distributed in the hope that it will be useful,
|
||||||
|
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
;; GNU General Public License for more details.
|
||||||
|
;;
|
||||||
|
;; You should have received a copy of the GNU General Public License
|
||||||
|
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
(defvar *src-tags* '("audio" "embed" "img" "input" "script" "source" "track" "video"))
|
||||||
|
(defvar *href-tags* '("link"))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Mirror-img
|
||||||
|
;;; ————————————————————————————————————————
|
||||||
|
(defun mirror-remote-urls (html-file base-dir &optional (tags '("img")))
|
||||||
|
"Attempt to mirror all remote HREF/SRC URLs of an HTML file’s tags,
|
||||||
|
downloading them to base-dir. The remote URLs will be replaced with the
|
||||||
|
local version, where downloading was successful.
|
||||||
|
Returns a string of the modified page’s HTML."
|
||||||
|
(let* ((dom (lquery:$ (lquery:initialize html-file)))
|
||||||
|
(url→pathname-alist
|
||||||
|
(mirror-linked-urls dom (append *src-tags* *href-tags*) base-dir))
|
||||||
|
(url→relative-url-alist
|
||||||
|
(mapcar (lambda (pair) (cons (car pair) (namestring (cdr pair))))
|
||||||
|
url→pathname-alist)))
|
||||||
|
(substitute-urls dom
|
||||||
|
url→relative-url-alist
|
||||||
|
(append *src-tags* *href-tags*))
|
||||||
|
(aref (lquery:$ dom (serialize)) 0)))
|
||||||
|
|
||||||
|
|
||||||
|
(defun mirror-linked-urls (dom &optional (tags '("link" "img" "script")) base-dir)
|
||||||
|
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
|
||||||
|
DOM, downloading them to either the current-working directory or the base-dir.
|
||||||
|
Returns an associative list containing pairs of remote-URLs to local pathnames.
|
||||||
|
Any unsuccessfuly-mirrored URL is excluded from the alist.
|
||||||
|
For example:
|
||||||
|
((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))"
|
||||||
|
(remove-if
|
||||||
|
#'not
|
||||||
|
(mapcar (lambda (url)
|
||||||
|
(let ((download-path
|
||||||
|
(ensure-directories-exist
|
||||||
|
(mirrored-pathname url base-dir))))
|
||||||
|
(if (http-fetch url download-path)
|
||||||
|
(cons url download-path))))
|
||||||
|
(linked-urls dom tags))))
|
||||||
|
|
||||||
|
|
||||||
|
(defun substitute-urls (dom substitution-alist &optional (tags '("img")))
|
||||||
|
"Replace SRC or HREF attributes of certain tags in an LQuery DOM, based on a
|
||||||
|
substitution associative list. The substitution alist is made up of cons-cells
|
||||||
|
mapping replacee URLs to substition URLs.
|
||||||
|
For instance, with the alist cons-cell (“https://xwx.moe/b.jpg” . “a/b.jpg”),
|
||||||
|
any HREF/SRC values of “https://xwx.moe/b.jpg” will be replaced with “a/b.jpg”,
|
||||||
|
among the given tags."
|
||||||
|
(mapcar
|
||||||
|
(lambda (tag)
|
||||||
|
(lquery:$ dom tag
|
||||||
|
#'(lambda (nodes)
|
||||||
|
(loop for node across nodes do
|
||||||
|
(let ((substitution
|
||||||
|
(cdr (assoc (node-url node)
|
||||||
|
substitution-alist :test #'equal))))
|
||||||
|
(if substitution
|
||||||
|
(set-node-url node substitution)))))))
|
||||||
|
tags)
|
||||||
|
dom)
|
||||||
|
|
||||||
|
|
||||||
|
(defun mirrored-pathname (url &optional base-dir)
|
||||||
|
"Given a URL, return the corresponding path we would download it to, were we
|
||||||
|
to mirror it."
|
||||||
|
(let* ((url (quri:uri url))
|
||||||
|
(path (quri:uri-path url)))
|
||||||
|
(pathname
|
||||||
|
(format nil "~@[~A/~]~A/~A~@[.~A~]"
|
||||||
|
base-dir
|
||||||
|
(quri:uri-host url)
|
||||||
|
(pathname-name path)
|
||||||
|
(pathname-type path)))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; DOM-parsing
|
||||||
|
;;; ————————————————————————————————————————
|
||||||
|
(defun linked-urls (dom &optional (tags '("link" "img" "script")))
|
||||||
|
"Return a list of all URLs in the LQuery DOM contained in the given tags’
|
||||||
|
HREF and SRC attributes."
|
||||||
|
(flet ((url-list-of-tag (tag)
|
||||||
|
(remove-if
|
||||||
|
#'not
|
||||||
|
(append
|
||||||
|
(coerce (lquery:$ dom tag (attr "src")) 'list)
|
||||||
|
(coerce (lquery:$ dom tag (attr "href")) 'list)))))
|
||||||
|
(loop for tag in tags
|
||||||
|
append (url-list-of-tag tag))))
|
||||||
|
|
||||||
|
|
||||||
|
(defun node-url (node)
|
||||||
|
"Return the SRC or HREF attribute of an LQuery node, if such a thing exists."
|
||||||
|
(and node
|
||||||
|
(or (lquery-funcs:attr node "src")
|
||||||
|
(lquery-funcs:attr node "href"))))
|
||||||
|
|
||||||
|
|
||||||
|
(defun set-node-url (node url)
|
||||||
|
"Set the SRC or HREF attribute of an LQuery node (based on which attribute is
|
||||||
|
already set)."
|
||||||
|
(and node
|
||||||
|
(if (lquery-funcs:attr node "src")
|
||||||
|
(lquery-funcs:attr node "src" url))
|
||||||
|
(if (lquery-funcs:attr node "href")
|
||||||
|
(lquery-funcs:attr node "href" url))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Util
|
||||||
|
;;; ————————————————————————————————————————
|
||||||
|
(defun http-fetch (url path)
|
||||||
|
"Download a URL to a path; if successful, returns the pathname. Otherwise, NIL."
|
||||||
|
(handler-case
|
||||||
|
(and (dexador:fetch (url-encode-uri (quri:uri url)) path)
|
||||||
|
(pathname path))
|
||||||
|
(file-exists ()
|
||||||
|
path)
|
||||||
|
(error nil)))
|
||||||
|
|
||||||
|
|
||||||
|
(defun url-encode-uri (uri)
|
||||||
|
"URL-encode the path component of a URI. For example,
|
||||||
|
“https://invalid.tld/dad alive.jpg” → “https://invalid.tld/dad%20alive.jpg”"
|
||||||
|
(let ((uri (quri:uri uri)))
|
||||||
|
(format nil "~@[~A://~]~A~@[:~A~]~A"
|
||||||
|
(quri:uri-scheme uri)
|
||||||
|
(quri:uri-host uri)
|
||||||
|
(quri:uri-port uri)
|
||||||
|
(url-encode-path (quri:uri-path uri)))))
|
||||||
|
|
||||||
|
|
||||||
|
(defun url-encode-path (path)
|
||||||
|
"URL-encode a pathname’s components (directory and filenames). For example,
|
||||||
|
“/images!/dad alive.jpg” → “/images%21/dad%20alive.jpg”"
|
||||||
|
(let* ((split (split-sequence:split-sequence #\/ path))
|
||||||
|
(path-parts (if (not (car split)) '("") split)))
|
||||||
|
(reduce
|
||||||
|
(lambda (a b)
|
||||||
|
(format nil "~A/~A" a b))
|
||||||
|
(mapcar #'quri:url-encode path-parts))))
|
Ŝarĝante…
Reference in New Issue