Init
This commit is contained in:
commit
8cbb3e338d
|
@ -0,0 +1,157 @@
|
|||
;; Copyright © 2024 Jaidyn Ann <jadedctrl@posteo.at>
|
||||
;;
|
||||
;; This program is free software: you can redistribute it and/or
|
||||
;; modify it under the terms of the GNU General Public License as
|
||||
;; published by the Free Software Foundation, either version 3 of
|
||||
;; the License, or (at your option) any later version.
|
||||
;;
|
||||
;; This program is distributed in the hope that it will be useful,
|
||||
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;; GNU General Public License for more details.
|
||||
;;
|
||||
;; You should have received a copy of the GNU General Public License
|
||||
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
(defvar *src-tags* '("audio" "embed" "img" "input" "script" "source" "track" "video"))
|
||||
(defvar *href-tags* '("link"))
|
||||
|
||||
|
||||
|
||||
;;; Mirror-img
|
||||
;;; ————————————————————————————————————————
|
||||
(defun mirror-remote-urls (html-file base-dir &optional (tags '("img")))
|
||||
"Attempt to mirror all remote HREF/SRC URLs of an HTML file’s tags,
|
||||
downloading them to base-dir. The remote URLs will be replaced with the
|
||||
local version, where downloading was successful.
|
||||
Returns a string of the modified page’s HTML."
|
||||
(let* ((dom (lquery:$ (lquery:initialize html-file)))
|
||||
(url→pathname-alist
|
||||
(mirror-linked-urls dom (append *src-tags* *href-tags*) base-dir))
|
||||
(url→relative-url-alist
|
||||
(mapcar (lambda (pair) (cons (car pair) (namestring (cdr pair))))
|
||||
url→pathname-alist)))
|
||||
(substitute-urls dom
|
||||
url→relative-url-alist
|
||||
(append *src-tags* *href-tags*))
|
||||
(aref (lquery:$ dom (serialize)) 0)))
|
||||
|
||||
|
||||
(defun mirror-linked-urls (dom &optional (tags '("link" "img" "script")) base-dir)
|
||||
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
|
||||
DOM, downloading them to either the current-working directory or the base-dir.
|
||||
Returns an associative list containing pairs of remote-URLs to local pathnames.
|
||||
Any unsuccessfuly-mirrored URL is excluded from the alist.
|
||||
For example:
|
||||
((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))"
|
||||
(remove-if
|
||||
#'not
|
||||
(mapcar (lambda (url)
|
||||
(let ((download-path
|
||||
(ensure-directories-exist
|
||||
(mirrored-pathname url base-dir))))
|
||||
(if (http-fetch url download-path)
|
||||
(cons url download-path))))
|
||||
(linked-urls dom tags))))
|
||||
|
||||
|
||||
(defun substitute-urls (dom substitution-alist &optional (tags '("img")))
|
||||
"Replace SRC or HREF attributes of certain tags in an LQuery DOM, based on a
|
||||
substitution associative list. The substitution alist is made up of cons-cells
|
||||
mapping replacee URLs to substition URLs.
|
||||
For instance, with the alist cons-cell (“https://xwx.moe/b.jpg” . “a/b.jpg”),
|
||||
any HREF/SRC values of “https://xwx.moe/b.jpg” will be replaced with “a/b.jpg”,
|
||||
among the given tags."
|
||||
(mapcar
|
||||
(lambda (tag)
|
||||
(lquery:$ dom tag
|
||||
#'(lambda (nodes)
|
||||
(loop for node across nodes do
|
||||
(let ((substitution
|
||||
(cdr (assoc (node-url node)
|
||||
substitution-alist :test #'equal))))
|
||||
(if substitution
|
||||
(set-node-url node substitution)))))))
|
||||
tags)
|
||||
dom)
|
||||
|
||||
|
||||
(defun mirrored-pathname (url &optional base-dir)
|
||||
"Given a URL, return the corresponding path we would download it to, were we
|
||||
to mirror it."
|
||||
(let* ((url (quri:uri url))
|
||||
(path (quri:uri-path url)))
|
||||
(pathname
|
||||
(format nil "~@[~A/~]~A/~A~@[.~A~]"
|
||||
base-dir
|
||||
(quri:uri-host url)
|
||||
(pathname-name path)
|
||||
(pathname-type path)))))
|
||||
|
||||
|
||||
|
||||
;;; DOM-parsing
|
||||
;;; ————————————————————————————————————————
|
||||
(defun linked-urls (dom &optional (tags '("link" "img" "script")))
|
||||
"Return a list of all URLs in the LQuery DOM contained in the given tags’
|
||||
HREF and SRC attributes."
|
||||
(flet ((url-list-of-tag (tag)
|
||||
(remove-if
|
||||
#'not
|
||||
(append
|
||||
(coerce (lquery:$ dom tag (attr "src")) 'list)
|
||||
(coerce (lquery:$ dom tag (attr "href")) 'list)))))
|
||||
(loop for tag in tags
|
||||
append (url-list-of-tag tag))))
|
||||
|
||||
|
||||
(defun node-url (node)
|
||||
"Return the SRC or HREF attribute of an LQuery node, if such a thing exists."
|
||||
(and node
|
||||
(or (lquery-funcs:attr node "src")
|
||||
(lquery-funcs:attr node "href"))))
|
||||
|
||||
|
||||
(defun set-node-url (node url)
|
||||
"Set the SRC or HREF attribute of an LQuery node (based on which attribute is
|
||||
already set)."
|
||||
(and node
|
||||
(if (lquery-funcs:attr node "src")
|
||||
(lquery-funcs:attr node "src" url))
|
||||
(if (lquery-funcs:attr node "href")
|
||||
(lquery-funcs:attr node "href" url))))
|
||||
|
||||
|
||||
|
||||
;;; Util
|
||||
;;; ————————————————————————————————————————
|
||||
(defun http-fetch (url path)
|
||||
"Download a URL to a path; if successful, returns the pathname. Otherwise, NIL."
|
||||
(handler-case
|
||||
(and (dexador:fetch (url-encode-uri (quri:uri url)) path)
|
||||
(pathname path))
|
||||
(file-exists ()
|
||||
path)
|
||||
(error nil)))
|
||||
|
||||
|
||||
(defun url-encode-uri (uri)
|
||||
"URL-encode the path component of a URI. For example,
|
||||
“https://invalid.tld/dad alive.jpg” → “https://invalid.tld/dad%20alive.jpg”"
|
||||
(let ((uri (quri:uri uri)))
|
||||
(format nil "~@[~A://~]~A~@[:~A~]~A"
|
||||
(quri:uri-scheme uri)
|
||||
(quri:uri-host uri)
|
||||
(quri:uri-port uri)
|
||||
(url-encode-path (quri:uri-path uri)))))
|
||||
|
||||
|
||||
(defun url-encode-path (path)
|
||||
"URL-encode a pathname’s components (directory and filenames). For example,
|
||||
“/images!/dad alive.jpg” → “/images%21/dad%20alive.jpg”"
|
||||
(let* ((split (split-sequence:split-sequence #\/ path))
|
||||
(path-parts (if (not (car split)) '("") split)))
|
||||
(reduce
|
||||
(lambda (a b)
|
||||
(format nil "~A/~A" a b))
|
||||
(mapcar #'quri:url-encode path-parts))))
|
Ŝarĝante…
Reference in New Issue