Compare commits
2 Enmetoj
f22dbba40d
...
57bad39463
Author | SHA1 | Date | |
---|---|---|---|
Jaidyn Ann | 57bad39463 | ||
Jaidyn Ann | 19e01c762d |
31
README.md
31
README.md
|
@ -6,11 +6,12 @@ file, mirroring its remote images, stylesheets, and other resources.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
```
|
```
|
||||||
usage: mirror-img [-h] [-d DIR] [-b BASE] HTML_FILE
|
usage: mirror-img [-h] [-d DIR] [-b BASE] [-s URL] HTML_FILE
|
||||||
mirror-img [-h] [-d DIR] [-b BASE]
|
mirror-img [-h] [-d DIR] [-b BASE] [-s URL]
|
||||||
Available options:
|
Available options:
|
||||||
-h, --help print this help text.
|
-h, --help print this help text.
|
||||||
-b, --base path to mirror directory used in URLs
|
-b, --base ARG path to mirror directory used in URLs
|
||||||
|
-s, --source ARG URL used to resolve & mirror relative URLs
|
||||||
-d, --downloads ARG directory for all mirrored files
|
-d, --downloads ARG directory for all mirrored files
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -21,12 +22,28 @@ In order to mirror a webpage, you can simply download it and pipe it into mirror
|
||||||
$ curl https://www.gnu.org/philosophy/philosophy.html | mirror-img > philosophy.html
|
$ curl https://www.gnu.org/philosophy/philosophy.html | mirror-img > philosophy.html
|
||||||
```
|
```
|
||||||
|
|
||||||
And now `philosophy.html` will be a fully-local HTML file, with no external resources!
|
And now `philosophy.html` is a fully-local HTML file with no external resources!
|
||||||
All mirrored content will be found in the `mirror/` directory, and all links
|
|
||||||
have been adjusted accordingly.
|
… at least, it *would* be. Notice how some resources, like the CSS, don’t load.
|
||||||
|
This is because they are defined as *relative* links (e.g., “../style.css”
|
||||||
|
rather than “https://invalid.tld/style.css”). In order for these to be
|
||||||
|
mirrored as well, mirror-img needs to somehow know the source URL.
|
||||||
|
|
||||||
|
You can use the `--source` argument to provide the source URL, so
|
||||||
|
relatively-linked resources can be mirrored, too:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ SOURCE_URL="https://www.gnu.org/philosophy/philosophy.html"
|
||||||
|
$ curl "$SOURCE_URL" | mirror-img --source "$SOURCE_URL" > philosophy.html
|
||||||
|
```
|
||||||
|
|
||||||
|
*Now* we’re done! All mirrored content will be found in the `mirror/`
|
||||||
|
directory, and all links have been adjusted accordingly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
If you’d like to change the download directory, you can use the `--downloads`
|
If you’d like to change the download directory, you can use the `--downloads`
|
||||||
argument. To change the directory used in the output HTML’s URLs, you can
|
argument. To change the directory used in the output-HTML’s URLs, you can
|
||||||
use `--base`.
|
use `--base`.
|
||||||
|
|
||||||
For example, if you’d like to mirror files into `/tmp/mirrors/` but have URLs
|
For example, if you’d like to mirror files into `/tmp/mirrors/` but have URLs
|
||||||
|
|
|
@ -28,19 +28,25 @@
|
||||||
|
|
||||||
;;; Mirror-img
|
;;; Mirror-img
|
||||||
;;; ————————————————————————————————————————
|
;;; ————————————————————————————————————————
|
||||||
;; pathname/input-stream pathname list → string
|
;; pathname/input-stream pathname string string list → string
|
||||||
(defun mirror-img (html-source download-dir &key (url-dir download-dir) (tags *default-tags*))
|
(defun mirror-img (html-source download-dir
|
||||||
|
&key html-url
|
||||||
|
(url-dir download-dir)
|
||||||
|
(tags *default-tags*))
|
||||||
"Attempt to mirror all remote HREF/SRC URLs of an HTML document’s tags,
|
"Attempt to mirror all remote HREF/SRC URLs of an HTML document’s tags,
|
||||||
downloading them to DOWNLOAD-DIR. For each remote URL that could be
|
downloading them to DOWNLOAD-DIR. For each remote URL that could be
|
||||||
successfuly mirrored, the remote URLs will be replaced with the local
|
successfuly mirrored, the remote URLs will be replaced with the local
|
||||||
version. The URL used will will be BASE-DIR plus the filename, where
|
version. The URL used will will be BASE-DIR plus the filename, where
|
||||||
BASE-DIR defaults to DOWNLOAD-DIR.
|
BASE-DIR defaults to DOWNLOAD-DIR.
|
||||||
HTML-SOURCE, the document, can be either a pathname or an input stream.
|
HTML-SOURCE, the document, can be either a pathname or an input stream.
|
||||||
|
If HTML-URL is provided, then relative URLs in the file will be interpreted
|
||||||
|
in relation to HTML-URL.
|
||||||
Returns a string of the modified page’s HTML."
|
Returns a string of the modified page’s HTML."
|
||||||
(let* ((dom
|
(let* ((dom
|
||||||
(lquery:$ (lquery:initialize html-source)))
|
(lquery:$ (lquery:initialize html-source)))
|
||||||
(url→pathname-alist
|
(url→pathname-alist
|
||||||
(mirror-linked-urls dom :tags tags :download-dir download-dir))
|
(mirror-linked-urls dom :tags tags :download-dir download-dir
|
||||||
|
:html-url html-url))
|
||||||
(url→relative-url-alist
|
(url→relative-url-alist
|
||||||
(mapcar
|
(mapcar
|
||||||
(lambda (pair)
|
(lambda (pair)
|
||||||
|
@ -53,7 +59,7 @@ Returns a string of the modified page’s HTML."
|
||||||
(aref (lquery:$ dom (serialize)) 0)))
|
(aref (lquery:$ dom (serialize)) 0)))
|
||||||
|
|
||||||
;; lquery-dom list pathname → alist
|
;; lquery-dom list pathname → alist
|
||||||
(defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir)
|
(defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir (html-url ""))
|
||||||
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
|
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
|
||||||
DOM, downloading them to either the current-working directory or the DOWNLOAD-DIR.
|
DOM, downloading them to either the current-working directory or the DOWNLOAD-DIR.
|
||||||
Returns an associative list containing pairs of remote-URLs to local pathnames.
|
Returns an associative list containing pairs of remote-URLs to local pathnames.
|
||||||
|
@ -62,12 +68,16 @@ For example:
|
||||||
((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))"
|
((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))"
|
||||||
(remove-if
|
(remove-if
|
||||||
#'not
|
#'not
|
||||||
(mapcar (lambda (url)
|
(mapcar (lambda (original-url)
|
||||||
(let ((download-path
|
(let* ((url (if (not (quri:uri-scheme (quri:uri original-url)))
|
||||||
(ensure-directories-exist
|
(quri:merge-uris original-url html-url) ; For relative URLs
|
||||||
(mirrored-pathname url :base-dir download-dir))))
|
original-url))
|
||||||
|
|
||||||
|
(download-path
|
||||||
|
(ensure-directories-exist
|
||||||
|
(mirrored-pathname url :base-dir download-dir))))
|
||||||
(if (http-fetch url download-path)
|
(if (http-fetch url download-path)
|
||||||
(cons url download-path))))
|
(cons original-url download-path))))
|
||||||
(linked-urls dom :tags tags))))
|
(linked-urls dom :tags tags))))
|
||||||
|
|
||||||
;; lquery-dom alist list → nil
|
;; lquery-dom alist list → nil
|
||||||
|
|
|
@ -41,7 +41,8 @@
|
||||||
(when-opt opts :help (help))
|
(when-opt opts :help (help))
|
||||||
(let* ((input-stream (choose-input-stream (car free)))
|
(let* ((input-stream (choose-input-stream (car free)))
|
||||||
(url-base (or (getf opts :url-base) (getf opts :download-dir) "mirror/"))
|
(url-base (or (getf opts :url-base) (getf opts :download-dir) "mirror/"))
|
||||||
(download-dir (or (getf opts :download-dir) url-base)))
|
(download-dir (or (getf opts :download-dir) url-base))
|
||||||
|
(source-url (getf opts :source-url)))
|
||||||
(when (not input-stream)
|
(when (not input-stream)
|
||||||
(error-print 1 "No HTML file provided. See --help for more information."))
|
(error-print 1 "No HTML file provided. See --help for more information."))
|
||||||
(format
|
(format
|
||||||
|
@ -49,7 +50,8 @@
|
||||||
(mirror-img:mirror-img
|
(mirror-img:mirror-img
|
||||||
input-stream
|
input-stream
|
||||||
download-dir
|
download-dir
|
||||||
:url-dir url-base))))
|
:url-dir url-base
|
||||||
|
:html-url source-url))))
|
||||||
(error (c)
|
(error (c)
|
||||||
(error-print 99 nil c))))
|
(error-print 99 nil c))))
|
||||||
|
|
||||||
|
@ -59,7 +61,20 @@
|
||||||
:short #\h :long "help")
|
:short #\h :long "help")
|
||||||
(:name :url-base
|
(:name :url-base
|
||||||
:description "path to mirror directory used in URLs"
|
:description "path to mirror directory used in URLs"
|
||||||
:short #\b :long "base")
|
:short #\b :long "base"
|
||||||
|
:arg-parser (lambda (base) base))
|
||||||
|
(:name :source-url
|
||||||
|
:description "URL used to resolve & mirror relative URLs"
|
||||||
|
:short #\s :long "source"
|
||||||
|
:arg-parser
|
||||||
|
(lambda (source)
|
||||||
|
(handler-case
|
||||||
|
(let ((uri (quri:uri source)))
|
||||||
|
(if (not (quri:uri-scheme uri))
|
||||||
|
(error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”).")
|
||||||
|
uri))
|
||||||
|
(error (c)
|
||||||
|
(error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”)." c)))))
|
||||||
(:name :download-dir
|
(:name :download-dir
|
||||||
:description "directory for all mirrored files"
|
:description "directory for all mirrored files"
|
||||||
:short #\d :long "downloads"
|
:short #\d :long "downloads"
|
||||||
|
@ -79,8 +94,8 @@
|
||||||
"Prints help message and dies."
|
"Prints help message and dies."
|
||||||
(unix-opts:describe
|
(unix-opts:describe
|
||||||
:prefix (format nil "~A~%~A"
|
:prefix (format nil "~A~%~A"
|
||||||
"usage: mirror-img [-h] [-d DIR] [-b BASE] HTML_FILE"
|
"usage: mirror-img [-h] [-d DIR] [-b BASE] [-s URL] HTML_FILE"
|
||||||
" mirror-img [-h] [-d DIR] [-b BASE]")
|
" mirror-img [-h] [-d DIR] [-b BASE] [-s URL]")
|
||||||
:stream stream)
|
:stream stream)
|
||||||
(unix-opts:exit exit-code))
|
(unix-opts:exit exit-code))
|
||||||
|
|
||||||
|
|
|
@ -51,10 +51,13 @@ system’s path."
|
||||||
(setq *mirror-img-result*
|
(setq *mirror-img-result*
|
||||||
(let* ((html-path (relative-pathname "t/testing-website/index.html"))
|
(let* ((html-path (relative-pathname "t/testing-website/index.html"))
|
||||||
(download-dir (relative-pathname "t/tmp/"))
|
(download-dir (relative-pathname "t/tmp/"))
|
||||||
|
(html-url "http://localhost:4242")
|
||||||
(url-dir (pathname-utils:relative-pathname
|
(url-dir (pathname-utils:relative-pathname
|
||||||
(relative-pathname "t/")
|
(relative-pathname "t/")
|
||||||
download-dir)))
|
download-dir)))
|
||||||
(mirror-img:mirror-img html-path download-dir :url-dir url-dir))))))
|
(mirror-img:mirror-img html-path download-dir
|
||||||
|
:url-dir url-dir
|
||||||
|
:html-url html-url))))))
|
||||||
|
|
||||||
(define-test mirror-img.files-mirrored (:tags '(mirror-img))
|
(define-test mirror-img.files-mirrored (:tags '(mirror-img))
|
||||||
(assert-equal
|
(assert-equal
|
||||||
|
@ -90,13 +93,13 @@ system’s path."
|
||||||
|
|
||||||
(define-test linked-urls (:tags '(dom))
|
(define-test linked-urls (:tags '(dom))
|
||||||
(assert-equal
|
(assert-equal
|
||||||
'("http://localhost:4242/res/img/b/fireplace.jpg"
|
'("/res/welcoming you.jpg"
|
||||||
|
"http://localhost:4242/res/img/b/fireplace.jpg"
|
||||||
"http://localhost:4242/res/img/b/I’m trying hard to randomly name these directories/more_calming.jpg"
|
"http://localhost:4242/res/img/b/I’m trying hard to randomly name these directories/more_calming.jpg"
|
||||||
"http://localhost:4242/res/img/b/I’m trying hard to randomly name these directories/ĉu ĉi tio sufiĉe hazardas?!/classy_fireplace.jpg"
|
"http://localhost:4242/res/img/b/I’m trying hard to randomly name these directories/ĉu ĉi tio sufiĉe hazardas?!/classy_fireplace.jpg"
|
||||||
"http://localhost:4242/res/img/level-2/café.jpg"
|
"http://localhost:4242/res/img/level-2/café.jpg"
|
||||||
"http://localhost:4242/res/img/merry christmas!!! ^_^.jpg"
|
"http://localhost:4242/res/img/merry christmas!!! ^_^.jpg"
|
||||||
"http://localhost:4242/res/style.css"
|
"http://localhost:4242/res/style.css")
|
||||||
"http://localhost:4242/res/welcoming you.jpg")
|
|
||||||
(sort (mirror-img::linked-urls
|
(sort (mirror-img::linked-urls
|
||||||
(lquery:$ (lquery:initialize
|
(lquery:$ (lquery:initialize
|
||||||
(relative-pathname "t/testing-website/index.html"))))
|
(relative-pathname "t/testing-website/index.html"))))
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
<h1>A sultry welcome to you</h1>
|
<h1>A sultry welcome to you</h1>
|
||||||
|
|
||||||
<figure>
|
<figure>
|
||||||
<img src="http://localhost:4242/res/welcoming you.jpg">
|
<img src="/res/welcoming you.jpg">
|
||||||
<figcaption>Welcome~
|
<figcaption>Welcome~
|
||||||
<a href="https://twitter.com/Distrbeachboy/status/1758538124785852426">
|
<a href="https://twitter.com/Distrbeachboy/status/1758538124785852426">
|
||||||
🔗
|
🔗
|
||||||
|
|
Ŝarĝante…
Reference in New Issue