From d475a5bde9bb0fb0b542c019aafc7e33ba647104 Mon Sep 17 00:00:00 2001 From: Hippo Date: Sat, 3 Apr 2021 21:29:01 +0530 Subject: [PATCH] ePub: download and insert article images before rendering This includes the feature image as well as other inline images. --- package.json | 1 + seance.js | 83 ++++++++++++++++++++++++++++++++++--- yarn.lock | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index cdddde8..bb5adbf 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,7 @@ "babel-preset-env": "^1.7.0", "body-parser": "^1.19.0", "bulma": "^0.8.2", + "cheerio": "^1.0.0-rc.5", "commander": "^3.0.2", "convict": "^5.2.0", "css-loader": "^3.5.3", diff --git a/seance.js b/seance.js index ef168ca..143a690 100644 --- a/seance.js +++ b/seance.js @@ -9,8 +9,10 @@ const readline = require('readline') const { markdown } = require('markdown') const GhostAdminAPI = require('@tryghost/admin-api') const { Canvas, Image } = require('canvas') +const slugify = require('underscore.string/slugify') const Rembrandt = require('rembrandt') const nodepub = require('nodepub') +const cheerio = require('cheerio') const config = require('./config') @@ -742,6 +744,77 @@ class Seance { allPosts.push(post) } + // prepare for image downloads + let pics = [] + let picFolder = path.join(options.outputFolder, 'seance-images') + if (!fs.existsSync(picFolder)) { + fs.mkdirSync(picFolder, { recursive: true }) + } + + // prepare array to collect processed posts + let processedPosts = [] + + for (let post of allPosts) { + // decide a post slug, for future files + let postSlug = slugify(post.title) + + // get the cover pic + let featurePicTag + if (!!post.feature_image) { + let imgUrl = post.feature_image + if (/^\/\//i.test(imgUrl)) { + imgUrl = 'https:' + imgUrl + } else if (!/^https?:\/\//i.test(imgUrl)) { + imgUrl = 'https://' + imgUrl + } + let response = await (await r2.get(imgUrl).response).buffer() + let ext = post.feature_image.split('.').pop() + await await fs.promises.writeFile(path.join(picFolder, `${postSlug}.${ext}`), response, 'base64') + featurePicTag = `` + pics.push(`${picFolder}/${postSlug}.${ext}`) + } + + let c = cheerio.load(`${featurePicTag}

${post.title}

${post.html}`) + + // hunt for other pics + // TODO: make asynchronous + let picCounter = 0 + c('img').each(async function() { + // skip if it's a local image + if (c(this).attr('src').indexOf('../images') == 0) { + return + } + + // first, process the url + let imgUrl = c(this).attr('src') + console.log('Downloading:', imgUrl) + if (/^\/\//i.test(imgUrl)) { + imgUrl = 'https:' + imgUrl + } else if (!/^https?:\/\//i.test(imgUrl)) { + imgUrl = 'https://' + imgUrl + } + + // now decide an output name + let ext = c(this).attr('src').split('.').pop() + let imageFile = path.join(picFolder, `${postSlug}-insert-${picCounter}.${ext}`) + + // note down our calculations + c(this).attr('src', `../images/${postSlug}-insert-${picCounter}.${ext}`) + pics.push(imageFile) + picCounter = picCounter + 1 + + // finally, download the images + let response = await (await r2.get(imgUrl).response).buffer() + await fs.promises.writeFile(imageFile, response, 'base64') + console.log('Downloaded to:', imageFile) + }) + + processedPosts.push({ + title: post.title, + body: c.html(), + }) + } + // decide metadata let metadata = { id: 'seance-test', // FIXME @@ -751,19 +824,19 @@ class Seance { contents: 'Table of Contents', genre: options.genre, cover: options.coverImage, - images: [], + images: pics, } // create the ePub let epub = nodepub.document(metadata) - for (let post of allPosts) { - console.log(`Adding: ${post.title}`) - epub.addSection(post.title, `

${post.title}

${post.html}`) + // add the documents + for (let post of processedPosts) { + epub.addSection(post.title, post.body) } // generate it! - await epub.writeEPUB(options.outputFolder, options.title) // FIXME + await epub.writeEPUB(options.outputFolder, options.title) } } diff --git a/yarn.lock b/yarn.lock index 534d8ff..399c4e7 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1143,6 +1143,11 @@ body-parser@1.19.0, body-parser@^1.19.0: raw-body "2.4.0" type-is "~1.6.17" +boolbase@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e" + integrity sha1-aN/1++YMUes3cl6p4+0xDcwed24= + brace-expansion@^1.1.7: version "1.1.11" resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" @@ -1444,6 +1449,30 @@ chalk@^2.0.1, chalk@^2.4.1, chalk@^2.4.2: escape-string-regexp "^1.0.5" supports-color "^5.3.0" +cheerio-select-tmp@^0.1.0: + version "0.1.1" + resolved "https://registry.yarnpkg.com/cheerio-select-tmp/-/cheerio-select-tmp-0.1.1.tgz#55bbef02a4771710195ad736d5e346763ca4e646" + integrity sha512-YYs5JvbpU19VYJyj+F7oYrIE2BOll1/hRU7rEy/5+v9BzkSo3bK81iAeeQEMI92vRIxz677m72UmJUiVwwgjfQ== + dependencies: + css-select "^3.1.2" + css-what "^4.0.0" + domelementtype "^2.1.0" + domhandler "^4.0.0" + domutils "^2.4.4" + +cheerio@^1.0.0-rc.5: + version "1.0.0-rc.5" + resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.5.tgz#88907e1828674e8f9fee375188b27dadd4f0fa2f" + integrity sha512-yoqps/VCaZgN4pfXtenwHROTp8NG6/Hlt4Jpz2FEP0ZJQ+ZUkVDd0hAPDNKhj3nakpfPt/CNs57yEtxD1bXQiw== + dependencies: + cheerio-select-tmp "^0.1.0" + dom-serializer "~1.2.0" + domhandler "^4.0.0" + entities "~2.1.0" + htmlparser2 "^6.0.0" + parse5 "^6.0.0" + parse5-htmlparser2-tree-adapter "^6.0.0" + chokidar@^2.1.8: version "2.1.8" resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-2.1.8.tgz#804b3a7b6a99358c3c5c61e71d8728f041cff917" @@ -1916,6 +1945,17 @@ css-loader@^3.5.3: schema-utils "^2.6.6" semver "^6.3.0" +css-select@^3.1.2: + version "3.1.2" + resolved "https://registry.yarnpkg.com/css-select/-/css-select-3.1.2.tgz#d52cbdc6fee379fba97fb0d3925abbd18af2d9d8" + integrity sha512-qmss1EihSuBNWNNhHjxzxSfJoFBM/lERB/Q4EnsJQQC62R2evJDW481091oAdOr9uh46/0n4nrg0It5cAnj1RA== + dependencies: + boolbase "^1.0.0" + css-what "^4.0.0" + domhandler "^4.0.0" + domutils "^2.4.3" + nth-check "^2.0.0" + css-selector-tokenizer@^0.7.0: version "0.7.2" resolved "https://registry.yarnpkg.com/css-selector-tokenizer/-/css-selector-tokenizer-0.7.2.tgz#11e5e27c9a48d90284f22d45061c303d7a25ad87" @@ -1930,6 +1970,11 @@ css-unit-converter@^1.1.1: resolved "https://registry.yarnpkg.com/css-unit-converter/-/css-unit-converter-1.1.1.tgz#d9b9281adcfd8ced935bdbaba83786897f64e996" integrity sha1-2bkoGtz9jO2TW9urqDeGiX9k6ZY= +css-what@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/css-what/-/css-what-4.0.0.tgz#35e73761cab2eeb3d3661126b23d7aa0e8432233" + integrity sha512-teijzG7kwYfNVsUh2H/YN62xW3KK9YhXEgSlbxMlcyjPNvdKJqFx5lrwlJgoFP1ZHlB89iGDlo/JyshKeRhv5A== + cssesc@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/cssesc/-/cssesc-3.0.0.tgz#37741919903b868565e1c09ea747445cd18983ee" @@ -2146,11 +2191,41 @@ diffie-hellman@^5.0.0: miller-rabin "^4.0.0" randombytes "^2.0.0" +dom-serializer@^1.0.1, dom-serializer@~1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-1.2.0.tgz#3433d9136aeb3c627981daa385fc7f32d27c48f1" + integrity sha512-n6kZFH/KlCrqs/1GHMOd5i2fd/beQHuehKdWvNNffbGHTr/almdhuVvTVFb3V7fglz+nC50fFusu3lY33h12pA== + dependencies: + domelementtype "^2.0.1" + domhandler "^4.0.0" + entities "^2.0.0" + domain-browser@^1.1.1: version "1.2.0" resolved "https://registry.yarnpkg.com/domain-browser/-/domain-browser-1.2.0.tgz#3d31f50191a6749dd1375a7f522e823d42e54eda" integrity sha512-jnjyiM6eRyZl2H+W8Q/zLMA481hzi0eszAaBUzIVnmYVDBbnLxVNnfu1HgEBvCbL+71FrxMl3E6lpKH7Ge3OXA== +domelementtype@^2.0.1, domelementtype@^2.1.0, domelementtype@^2.2.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.2.0.tgz#9a0b6c2782ed6a1c7323d42267183df9bd8b1d57" + integrity sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A== + +domhandler@^4.0.0, domhandler@^4.1.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-4.1.0.tgz#c1d8d494d5ec6db22de99e46a149c2a4d23ddd43" + integrity sha512-/6/kmsGlMY4Tup/nGVutdrK9yQi4YjWVcVeoQmixpzjOUK1U7pQkvAPHBJeUxOgxF0J8f8lwCJSlCfD0V4CMGQ== + dependencies: + domelementtype "^2.2.0" + +domutils@^2.4.3, domutils@^2.4.4: + version "2.5.1" + resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.5.1.tgz#9b8e84b5d9f788499ae77506ea832e9b4f9aa1c0" + integrity sha512-hO1XwHMGAthA/1KL7c83oip/6UWo3FlUNIuWiWKltoiQ5oCOiqths8KknvY2jpOohUoUgnwa/+Rm7UpwpSbY/Q== + dependencies: + dom-serializer "^1.0.1" + domelementtype "^2.2.0" + domhandler "^4.1.0" + dotenv@^8.2.0: version "8.2.0" resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-8.2.0.tgz#97e619259ada750eea3e4ea3e26bceea5424b16a" @@ -2240,6 +2315,16 @@ entities@^1.1.1: resolved "https://registry.yarnpkg.com/entities/-/entities-1.1.2.tgz#bdfa735299664dfafd34529ed4f8522a275fea56" integrity sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w== +entities@^2.0.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55" + integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A== + +entities@~2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/entities/-/entities-2.1.0.tgz#992d3129cf7df6870b96c57858c249a120f8b8b5" + integrity sha512-hCx1oky9PFrJ611mf0ifBLBRW8lUUVRlFolb5gWRfIELabBlbp9xZvrqZLZAs+NxFnbfQoeGd8wDkygjg7U85w== + errno@^0.1.3, errno@~0.1.7: version "0.1.7" resolved "https://registry.yarnpkg.com/errno/-/errno-0.1.7.tgz#4684d71779ad39af177e3f007996f7c67c852618" @@ -2899,6 +2984,16 @@ html-comment-regex@^1.1.0: resolved "https://registry.yarnpkg.com/html-comment-regex/-/html-comment-regex-1.1.2.tgz#97d4688aeb5c81886a364faa0cad1dda14d433a7" integrity sha512-P+M65QY2JQ5Y0G9KKdlDpo0zK+/OHptU5AaBwUfAIDJZk1MYf32Frm84EcOytfJE0t5JvkAnKlmjsXDnWzCJmQ== +htmlparser2@^6.0.0: + version "6.0.1" + resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-6.0.1.tgz#422521231ef6d42e56bd411da8ba40aa36e91446" + integrity sha512-GDKPd+vk4jvSuvCbyuzx/unmXkk090Azec7LovXP8as1Hn8q9p3hbjmDGbUqqhknw0ajwit6LiiWqfiTUPMK7w== + dependencies: + domelementtype "^2.0.1" + domhandler "^4.0.0" + domutils "^2.4.4" + entities "^2.0.0" + http-errors@1.7.2: version "1.7.2" resolved "https://registry.yarnpkg.com/http-errors/-/http-errors-1.7.2.tgz#4f5029cf13239f31036e5b2e55292bcfbcc85c8f" @@ -4222,6 +4317,13 @@ npm-packlist@^1.1.6: gauge "~2.7.3" set-blocking "~2.0.0" +nth-check@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.0.0.tgz#1bb4f6dac70072fc313e8c9cd1417b5074c0a125" + integrity sha512-i4sc/Kj8htBrAiH1viZ0TgU8Y5XqCaV/FziYK6TBczxmeKm3AEFWqqF3195yKudrarqy7Zu80Ra5dobFjn9X/Q== + dependencies: + boolbase "^1.0.0" + null-loader@^0.1.1: version "0.1.1" resolved "https://registry.yarnpkg.com/null-loader/-/null-loader-0.1.1.tgz#17be9abfcd3ff0e1512f6fc4afcb1f5039378fae" @@ -4409,6 +4511,18 @@ parse-json@^4.0.0: error-ex "^1.3.1" json-parse-better-errors "^1.0.1" +parse5-htmlparser2-tree-adapter@^6.0.0: + version "6.0.1" + resolved "https://registry.yarnpkg.com/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-6.0.1.tgz#2cdf9ad823321140370d4dbf5d3e92c7c8ddc6e6" + integrity sha512-qPuWvbLgvDGilKc5BoicRovlT4MtYT6JfJyBOMDsKoiT+GiuP5qyrPCnR9HcPECIJJmZh5jRndyNThnhhb/vlA== + dependencies: + parse5 "^6.0.1" + +parse5@^6.0.0, parse5@^6.0.1: + version "6.0.1" + resolved "https://registry.yarnpkg.com/parse5/-/parse5-6.0.1.tgz#e1a1c085c569b3dc08321184f19a39cc27f7c30b" + integrity sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw== + parseurl@~1.3.3: version "1.3.3" resolved "https://registry.yarnpkg.com/parseurl/-/parseurl-1.3.3.tgz#9da19e7bee8d12dff0513ed5b76957793bc2e8d4"