From 19e01c762d027348dc8a1182091bdc0585f115b9 Mon Sep 17 00:00:00 2001 From: Jaidyn Ann <10477760+JadedCtrl@users.noreply.github.com> Date: Fri, 31 May 2024 22:29:47 -0500 Subject: [PATCH] Add --source & :HTML-URL to resolve relative URLs Now, relative URLs in the input file can be mirrored in relation to a given URL. --- src/mirror-img.lisp | 28 +++++++++++++++++++--------- src/unix.lisp | 25 ++++++++++++++++++++----- t/mirror-img.lisp | 11 +++++++---- t/testing-website/index.html | 2 +- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/src/mirror-img.lisp b/src/mirror-img.lisp index 2f4f67d..a417f4e 100644 --- a/src/mirror-img.lisp +++ b/src/mirror-img.lisp @@ -28,19 +28,25 @@ ;;; Mirror-img ;;; ———————————————————————————————————————— -;; pathname/input-stream pathname list → string -(defun mirror-img (html-source download-dir &key (url-dir download-dir) (tags *default-tags*)) +;; pathname/input-stream pathname string string list → string +(defun mirror-img (html-source download-dir + &key html-url + (url-dir download-dir) + (tags *default-tags*)) "Attempt to mirror all remote HREF/SRC URLs of an HTML document’s tags, downloading them to DOWNLOAD-DIR. For each remote URL that could be successfuly mirrored, the remote URLs will be replaced with the local version. The URL used will will be BASE-DIR plus the filename, where BASE-DIR defaults to DOWNLOAD-DIR. HTML-SOURCE, the document, can be either a pathname or an input stream. +If HTML-URL is provided, then relative URLs in the file will be interpreted +in relation to HTML-URL. Returns a string of the modified page’s HTML." (let* ((dom (lquery:$ (lquery:initialize html-source))) (url→pathname-alist - (mirror-linked-urls dom :tags tags :download-dir download-dir)) + (mirror-linked-urls dom :tags tags :download-dir download-dir + :html-url html-url)) (url→relative-url-alist (mapcar (lambda (pair) @@ -53,7 +59,7 @@ Returns a string of the modified page’s HTML." (aref (lquery:$ dom (serialize)) 0))) ;; lquery-dom list pathname → alist -(defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir) +(defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir (html-url "")) "Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery DOM, downloading them to either the current-working directory or the DOWNLOAD-DIR. Returns an associative list containing pairs of remote-URLs to local pathnames. @@ -62,12 +68,16 @@ For example: ((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))" (remove-if #'not - (mapcar (lambda (url) - (let ((download-path - (ensure-directories-exist - (mirrored-pathname url :base-dir download-dir)))) + (mapcar (lambda (original-url) + (let* ((url (if (not (quri:uri-scheme (quri:uri original-url))) + (quri:merge-uris original-url html-url) ; For relative URLs + original-url)) + + (download-path + (ensure-directories-exist + (mirrored-pathname url :base-dir download-dir)))) (if (http-fetch url download-path) - (cons url download-path)))) + (cons original-url download-path)))) (linked-urls dom :tags tags)))) ;; lquery-dom alist list → nil diff --git a/src/unix.lisp b/src/unix.lisp index 2eaa2c6..0a41a67 100644 --- a/src/unix.lisp +++ b/src/unix.lisp @@ -41,7 +41,8 @@ (when-opt opts :help (help)) (let* ((input-stream (choose-input-stream (car free))) (url-base (or (getf opts :url-base) (getf opts :download-dir) "mirror/")) - (download-dir (or (getf opts :download-dir) url-base))) + (download-dir (or (getf opts :download-dir) url-base)) + (source-url (getf opts :source-url))) (when (not input-stream) (error-print 1 "No HTML file provided. See --help for more information.")) (format @@ -49,7 +50,8 @@ (mirror-img:mirror-img input-stream download-dir - :url-dir url-base)))) + :url-dir url-base + :html-url source-url)))) (error (c) (error-print 99 nil c)))) @@ -59,7 +61,20 @@ :short #\h :long "help") (:name :url-base :description "path to mirror directory used in URLs" - :short #\b :long "base") + :short #\b :long "base" + :arg-parser (lambda (base) base)) + (:name :source-url + :description "URL used to resolve & mirror relative URLs" + :short #\s :long "source" + :arg-parser + (lambda (source) + (handler-case + (let ((uri (quri:uri source))) + (if (not (quri:uri-scheme uri)) + (error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”).") + uri)) + (error (c) + (error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”)." c))))) (:name :download-dir :description "directory for all mirrored files" :short #\d :long "downloads" @@ -79,8 +94,8 @@ "Prints help message and dies." (unix-opts:describe :prefix (format nil "~A~%~A" - "usage: mirror-img [-h] [-d DIR] [-b BASE] HTML_FILE" - " mirror-img [-h] [-d DIR] [-b BASE]") + "usage: mirror-img [-h] [-d DIR] [-b BASE] [-s URL] HTML_FILE" + " mirror-img [-h] [-d DIR] [-b BASE] [-s URL]") :stream stream) (unix-opts:exit exit-code)) diff --git a/t/mirror-img.lisp b/t/mirror-img.lisp index 08e9c05..069556b 100644 --- a/t/mirror-img.lisp +++ b/t/mirror-img.lisp @@ -51,10 +51,13 @@ system’s path." (setq *mirror-img-result* (let* ((html-path (relative-pathname "t/testing-website/index.html")) (download-dir (relative-pathname "t/tmp/")) + (html-url "http://localhost:4242") (url-dir (pathname-utils:relative-pathname (relative-pathname "t/") download-dir))) - (mirror-img:mirror-img html-path download-dir :url-dir url-dir)))))) + (mirror-img:mirror-img html-path download-dir + :url-dir url-dir + :html-url html-url)))))) (define-test mirror-img.files-mirrored (:tags '(mirror-img)) (assert-equal @@ -90,13 +93,13 @@ system’s path." (define-test linked-urls (:tags '(dom)) (assert-equal - '("http://localhost:4242/res/img/b/fireplace.jpg" + '("/res/welcoming you.jpg" + "http://localhost:4242/res/img/b/fireplace.jpg" "http://localhost:4242/res/img/b/I’m trying hard to randomly name these directories/more_calming.jpg" "http://localhost:4242/res/img/b/I’m trying hard to randomly name these directories/ĉu ĉi tio sufiĉe hazardas?!/classy_fireplace.jpg" "http://localhost:4242/res/img/level-2/café.jpg" "http://localhost:4242/res/img/merry christmas!!! ^_^.jpg" - "http://localhost:4242/res/style.css" - "http://localhost:4242/res/welcoming you.jpg") + "http://localhost:4242/res/style.css") (sort (mirror-img::linked-urls (lquery:$ (lquery:initialize (relative-pathname "t/testing-website/index.html")))) diff --git a/t/testing-website/index.html b/t/testing-website/index.html index 3c88d4a..a7add81 100644 --- a/t/testing-website/index.html +++ b/t/testing-website/index.html @@ -11,7 +11,7 @@

A sultry welcome to you

- +
Welcome~ 🔗