#!/usr/bin/env node --env-file .env
// https://gist.github.com/pirate/d9a350e83025a1e6cf452cddd815d0d4
// npm install request node-request minimist deepmerge mime-types decompress puppeteer-extra puppeteer-extra-plugin-repl puppeteer-extra-plugin-user-preferences puppeteer-extra-plugin-recaptcha puppeteer-extra-plugin-stealth puppeteer-screen-recorder puppeteer-cluster ghost-cursor @mozilla/readability jsdom unzip-crx-3 node-fetch@2
import assert from 'node:assert/strict';
import { Buffer } from 'node:buffer';
import child_process from 'node:child_process';
import crypto from 'node:crypto';
import fs from 'node:fs';
import { createServer } from 'node:http';
import os from 'node:os';
import path from 'node:path';
import querystring from 'node:querystring';
import { Readable } from 'node:stream';
import { finished } from 'node:stream/promises';
import { URL } from 'node:url';
import util from 'node:util';
const exec = util.promisify(child_process.exec);
import { Readability } from '@mozilla/readability';
import FileCookieStore from '@root/file-cookie-store';
import merge from 'deepmerge';
import { createCursor, getRandomPagePoint } from 'ghost-cursor';
import { JSDOM, VirtualConsole } from 'jsdom';
import mime from 'mime-types';
import ToughCookie from 'tough-cookie';
import unzip from 'unzip-crx-3';
import puppeteer from 'puppeteer';
import { Browser, Page, Cookie, HTTPResponse } from 'puppeteer';
import { Cluster } from 'puppeteer-cluster';
import PupeteerExtra from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import PrefsPlugin from 'puppeteer-extra-plugin-user-preferences' ;
import { PuppeteerScreenRecorder } from 'puppeteer-screen-recorder' ;
// import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
// import ReplPlugin from 'puppeteer-extra-plugin-repl';
const __dirname = import . meta . dirname
import { getDatabase } from './models/init-models.js' ;
const { Tag , Snapshot , ArchiveResult } = await getDatabase ( { dbpath : './index.sqlite3' } )
// move mitm CA cert into /usr/local/share/ca-certificates/mitmproxy-ca-cert.crt
// update-ca-certificates
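// A minimal sketch of the two manual steps above, assuming the default mitmproxy CA location
// (~/.mitmproxy/mitmproxy-ca-cert.pem) and a Debian-style update-ca-certificates; the name
// installMitmCaCert is hypothetical and nothing below calls it automatically.
async function installMitmCaCert({
    src = path.join(os.homedir(), '.mitmproxy', 'mitmproxy-ca-cert.pem'),   // assumed default mitmproxy CA path
    dest = '/usr/local/share/ca-certificates/mitmproxy-ca-cert.crt',        // destination from the note above
} = {}) {
    await fs.promises.copyFile(src, dest)       // needs root to write into /usr/local/share
    await exec('update-ca-certificates')        // refresh the system trust store
}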
const ANSI = {
reset : "\x1b[0m" ,
blue : "\x1b[34m" ,
black : "\x1b[30m" ,
}
/************************* Main Input Arguments *******************************/
let URLS = [
// 'chrome://about',
// 'chrome://system/#chrome_root_store',
'https://facebook.com/815781663692514/?comment_id=1508571679703640' ,
'https://www.instagram.com/p/CrTY1fENHr5/' ,
'https://www.tiktok.com/@zemmour_eric/video/7342474065598319904?cid=7343316616878490400' ,
'https://twitter.com/DZasken68678/status/1799833933271687304' ,
'https://t.me/IONONMIARRENDOGROUP/13598' ,
'https://www.youtube.com/watch?v=rpD0qgzlCms' ,
'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/' ,
'https://gologin.com/check-browser' ,
'https://arh.antoinevastel.com/bots/areyouheadless' ,
'https://2captcha.com/demo/hcaptcha' ,
'https://2captcha.com/demo/cloudflare-turnstile' ,
'https://2captcha.com/demo/recaptcha-v3' ,
'https://ipinfo.io/' ,
// 'https://2captcha.com/demo/recaptcha-v2',
// 'https://2captcha.com/demo/keycaptcha',
// 'https://browserleaks.com/canvas',
// 'https://bot.incolumitas.com/#botChallenge',
// 'https://infosimples.github.io/detect-headless/',
// 'https://coveryourtracks.eff.org/',
// 'https://fingerprint.com/demo/',
// 'https://nowsecure.nl',
// 'https://abrahamjuliot.github.io/creepjs/',
// 'https://scrapfly.io/web-scraping-tools/http2-fingerprint',
// 'https://scrapfly.io/web-scraping-tools/browser-fingerprint',
// 'https://scrapfly.io/web-scraping-tools/ja3-fingerprint',
// 'https://scrapfly.io/web-scraping-tools/canvas-fingerprint',
// 'https://scrapfly.io/web-scraping-tools/webgl-fingerprint',
// 'https://scrapfly.io/web-scraping-tools/audio-fingerprint',
// 'https://scrapfly.io/web-scraping-tools/screen-fingerprint',
// 'https://web-scraping.dev/',
// 'https://example.com',
// 'https://www.okta.com/',
// 'https://www.webflow.com/',
// 'https://docker-compose.archivebox.io',
// 'https://www.reddit.com/r/AskReddit/comments/1br0q9b/what_was_ok_10_years_ago_but_isnt_today/',
// 'https://www.quora.com/Is-the-website-2Captcha-true-or-fake-with-paying-money-for-working-on-it',
// 'https://x.com/yawnzzcalo7/status/1747853178849435894',
// 'https://twitter.com/yawnzzcalo7/status/1747853178849435894',
// 'https://rachdele.substack.com/p/is-the-job-market-dying',
// 'https://www.flowradar.com/cloneables/mouse-image-trail-effect',
// 'https://wrong.host.badssl.com/',
// 'http://docker-compose.archivebox.io',
// 'https://pptr.dev/api/puppeteer.page.setrequestinterception',
// 'https://blog.sweeting.me#Writing',
// 'https://github.com/yarnpkg/yarn/issues/9005',
// 'https://archive.md/739Oc',
// 'https://archive.md/Oc72d',
// 'https://archive.vn/fPUBe',
// 'https://archive.vn/mRz4P',
// 'https://archive.vn/Qct6Y',
// 'https://archive.vn/sv50h',
// 'https://facebook.com/815781663692514/?comment_id=1508571679703640',
// 'https://facebook.com/815781663692514/?comment_id=924451748966499',
// 'https://www.facebook.com/wayne.brennan.528/posts/pfbid02fvxFppng2WsHMavhBa62cXizCBGdmPQRH3CMhac79qzS5C1ADaSNC587d3u6qVbkl',
// 'https://www.facebook.com/wildeprods/posts/pfbid02YEPfoB7pZqMNzE4y2MpYSQbRAzASquvHyEMzHqrNngJCSL7onEg2jnsqS6epcQHWl',
// 'https://t.me/aubontouite_francais/9493',
// 'https://t.me/BC_BLACKMIROR/5044',
// 'https://t.me/IONONMIARRENDOGROUP/14004',
// 'https://t.me/newsfactory_pl/51014',
// 'https://t.me/oliverjanich/132574',
// 'https://t.me/tomaszgryguc/10449',
// 'https://t.me/amigosDisidentes/123177',
// 'https://twitter.com/1nfiltr4do_NN/status/1767238399943991389',
// 'https://twitter.com/4lmondcookie/status/1748519205438111914',
// 'https://twitter.com/4olll1ke/status/1753796944827199766',
// 'https://twitter.com/yeokiloss/status/1754908226179502345',
// 'https://twitter.com/YoungWaifLover/status/1735667278090297561',
// 'https://twitter.com/Z_Pour_Demain/status/1766133730278605182',
// 'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
// 'https://www.aap.com.au/factcheck/absurd-albanese-clip-fools-voice-voters/',
// 'https://www.instagram.com/_the.forgotten.ones/p/CQQDyoqhsF6/',
// 'https://www.instagram.com/p/CqSM_f9MR4b/',
// 'https://www.instagram.com/p/CqSQgf1sv8B/',
// 'https://instagram.com/p/B-Q22Z_pxyC/',
// 'https://www.tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
// 'https://tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
// 'https://www.youtube.com/watch?v=rpD0qgzlCms',
]
const isTruthy = ( env_value ) = > [ '1' , 'yes' , 'true' ] . includes ( env_value ? . toLowerCase ( ) || 'false' )
/********************** Config: General High-Level Options ********************/
const PASSIVE_ARCHIVING = isTruthy ( process . env . PASSIVE_ARCHIVING )
const CHROME_CLUSTER = isTruthy ( process . env . CHROME_CLUSTER )
const CHROME_CLUSTER_WORKERS = 4
const API_SERVER_HOST = '0.0.0.0'
const API_SERVER_PORT = 9595
const CHROME_DEBUG_PORT = 9222 // 9222 is default, or use 0 for random port
/********************** Config: Keys & Secrets ********************************/
const API_KEY_2CAPTCHA = process . env . API_KEY_2CAPTCHA || 'YOUR_API_KEY_HERE'
const FLARESOLVERR_API_ENDPOINT = process . env . FLARESOLVERR_API_ENDPOINT || "http://localhost:8191/v1"
const ACTIVE_PERSONA = process . env . ACTIVE_PERSONA || 'Default'
const CHROME_PROFILE_USER = process . env . CHROME_PROFILE_USER || 'Default'
const LOAD_AUTH_STORAGE = isTruthy ( process . env . LOAD_AUTH_STORAGE )
const SAVE_AUTH_STORAGE = isTruthy ( process . env . SAVE_AUTH_STORAGE )
/********************** Config: Data Dir Locations ****************************/
const SRC_DIR = path . resolve ( __dirname )
const DATA_DIR = process . env . DATA_DIR || await fs . promises . realpath ( path . join ( SRC_DIR , 'data' ) )
const INDEXES_DIR = path . join ( DATA_DIR , 'index' )
const ARCHIVE_DIR = path . join ( DATA_DIR , 'archive' )
if ( ! fs . existsSync ( ARCHIVE_DIR ) )
throw 'Could not find data/archive, are you running in the right pwd?'
const PERSONA_DIR = path . join ( DATA_DIR , 'personas' , ACTIVE_PERSONA )
const CHROME_PROFILE_PATH = path . join ( PERSONA_DIR , 'chrome_profile' )
const CHROME_DOWNLOADS_DIR = path . join ( PERSONA_DIR , 'chrome_downloads' )
const CHROME_EXTENSIONS_DIR = path . join ( PERSONA_DIR , 'chrome_extensions' )
const CHROME_EXTENSIONS_JSON_PATH = path . join ( CHROME_EXTENSIONS_DIR , 'extensions.json' )
const AUTH_JSON_PATH = path . join ( PERSONA_DIR , 'auth.json' )
const COOKIES_TXT_PATH = path . join ( PERSONA_DIR , 'cookies.txt' )
const SPEEDTESTS_DIR = path . join ( PERSONA_DIR , 'speedtests' )
// const CHROME_PROFILE_IMPORT_USER = 'Profile 1'
// const CHROME_PROFILE_IMPORT_PATH = '/Volumes/NVME/Users/squash/Library/Application Support/Google/Chrome'
// chrome profile / persona directories
fs . mkdirSync ( PERSONA_DIR , { recursive : true } )
fs . mkdirSync ( SPEEDTESTS_DIR , { recursive : true } )
fs . mkdirSync ( CHROME_PROFILE_PATH , { recursive : true } )
fs . mkdirSync ( CHROME_EXTENSIONS_DIR , { recursive : true } )
fs . mkdirSync ( CHROME_DOWNLOADS_DIR , { recursive : true } )
// cruft directories
const ORPHANS_DIR = path . join ( DATA_DIR , 'orphans' )
const PARTIALS_DIR = path . join ( DATA_DIR , 'partials' )
const DUPLICATES_DIR = path . join ( DATA_DIR , 'duplicates' )
await fs . promises . mkdir ( ORPHANS_DIR , { recursive : true } )
await fs . promises . mkdir ( PARTIALS_DIR , { recursive : true } )
await fs . promises . mkdir ( DUPLICATES_DIR , { recursive : true } )
/********************** Config: Viewport Setup Opts ***************************/
// Config: Viewport
const DEFAULT_TIMEOUT = 20_000
const DEFAULT_GEOLOCATION = { latitude : 59.95 , longitude : 30.31667 }
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
const DEFAULT_ASPECT_RAIO = 16 / 9 // recommended: 16:9 (most common desktop window aspect ratio)
const SCREENSHOT_ASPECT_RATIO = 4 / 3 // recommended: 4:3 (easier to use as thumbnails when square-ish)
const DEFAULT_WINDOW_WIDTH = 1920 // recommended: 1920x1080p (1080p screenshots)
const DEFAULT_WINDOW_HEIGHT = Math . floor ( DEFAULT_WINDOW_WIDTH / DEFAULT_ASPECT_RAIO )
const DEFAULT_VIEWPORT = {
width : DEFAULT_WINDOW_WIDTH ,
height : DEFAULT_WINDOW_HEIGHT ,
deviceScaleFactor : 2 , // 2 gives much sharper text in screenshots/pdfs/etc but uses more CPU/GPU
isMobile : false ,
hasTouch : false ,
isLandscape : false ,
}
const DEFAULT_COLOR_SCHEME = 'light'
const DEFAULT_HEADERS = {
// requires frequent tweaking to remain undetected by cloudflare/recaptcha/etc.
// 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
// 'accept-encoding': 'gzip, deflate, br, zstd',
// 'accept-language': accept_language,
// 'cache-Control': no_cache ? 'no-cache' : '',
// 'dnt': '1',
'sec-ch-ua' : '"Google Chrome";v="122", "Not:A-Brand";v="8", "Chromium";v="122"' ,
'sec-ch-ua-mobile' : '?0' ,
'sec-ch-ua-platform' : '"macOS"' ,
'connection-rtt' : '50' ,
// 'pragma': no_cache ? 'no-cache' : '',
// 'sec-fetch-dest': 'document',
// 'sec-fetch-mode': 'navigate',
// 'sec-fetch-site': 'none',
// 'sec-fetch-user': '?1',
// // 'upgrade-insecure-requests': '1', // breaks some sites, e.g. https://www.flowradar.com/cloneables/mouse-image-trail-effect
// 'user-agent': user_agent,
}
const DEFAULT_REFERRERS = [ "https://www.google.com" , "https://www.facebook.com" , "https://www.instagram.com" ]
/****************** Config: Human Behavior Emulation **************************/
const SCROLL_LIMIT = 20 ; // e.g. 30 = 30 * (1000px/2s) => 30,000px scrolled in 60sec
const SCROLL_DELAY = 1350 ; // interval per scroll, e.g. 2000 = 2sec to travel 1 * SCROLL_DISTANCE
const SCROLL_DISTANCE = DEFAULT_VIEWPORT . height - 100 ; // make sure this is slightly less than viewport height so there is some overlap to make stitching easier
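// Hedged sketch of how the three constants above combine into a scroll loop; the real scrollDown()
// used during archiving is defined further below, so this stub (scrollDownSketch) is illustration only.
async function scrollDownSketch(page) {
    for (let i = 0; i < SCROLL_LIMIT; i++) {
        await page.evaluate((distance) => window.scrollBy({top: distance, behavior: 'smooth'}), SCROLL_DISTANCE)
        await new Promise(resolve => setTimeout(resolve, SCROLL_DELAY))   // pause so lazy-loaded content can land
    }
}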
/********************** Config: URL Rewriting *********************************/
const URL_REWRITES = [
// replacements should come first
// {
// idx: 0,
// pattern: /\/\/(www\.)?x\.com/gi,
// replacement: '//$1twitter.com/',
// // TODO: scope: 'hostname',
// },
// {
// idx: 1,
// pattern: /\/\/(www\.)?twitter\.com/gi,
// replacement: '//$1nitter.net',
// // TODO: scope: 'hostname',
// },
// // blocks should come at the end
// {
// idx: 999,
// pattern: /\/\/(www\.)?notallowed\.com/gi,
// replacement: '',
// // TODO: scope: 'href',
// },
]
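// Hedged sketch of how URL_REWRITES entries could be applied, in ascending idx order, with an empty
// replacement meaning "block this URL"; rewriteUrl is a hypothetical helper, the actual rewriting is
// wired into each page later via setupURLRewriting().
const rewriteUrl = (url) => {
    for (const {pattern, replacement} of [...URL_REWRITES].sort((a, b) => a.idx - b.idx)) {
        const rewritten = url.replace(pattern, replacement)
        if (rewritten !== url && replacement === '') return null   // matched a block rule: drop the URL
        url = rewritten
    }
    return url
}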
const URL_SCHEMES_IGNORED = [
'' , // no scheme is also invalid (e.g. opening a new tab page without any url yet)
'chrome' ,
'chrome-extension' ,
'chrome-untrusted' ,
'file' ,
'data' ,
'about' ,
]
/**************** Load existing data/archive/<timestamp> snapshots *************/
const snapshots = await Snapshot . findAll ( { attributes : [ 'id' , 'timestamp' , 'url' ] } ) // include: { model: ArchiveResult, as: 'archiveresults' }, });
const results = await ArchiveResult . findAll ( { attributes : [ 'id' , 'snapshot_id' , 'extractor' , 'start_ts' ] } ) // include: { model: Snapshot, as: 'snapshot' }, });
globalThis . snapshots = snapshots
globalThis . results = results
console . log ( ` [💿] Found ${ snapshots . length } existing snapshots in index.sqlite3... ` )
console . log ( ` [💿] Found ${ results . length } existing results in index.sqlite3... ` )
// debugger;
const locateExistingSnapshots = ( archive_dir ) = > {
const urls_to_dirs = { }
// for each data/archive/<timestamp>/index.json found, store {url: data/archive/<timestamp>}
for ( const snapshot_dir of fs . readdirSync ( archive_dir ) ) {
const snapshot_json = path . join ( archive_dir , snapshot_dir , 'index.json' )
if ( fs . existsSync ( snapshot_json ) ) {
const { url , archive_path } = JSON . parse ( fs . readFileSync ( snapshot_json , 'utf-8' ) )
if ( ! snapshot_dir . includes ( archive_path . replace ( 'archive/' , '' ) ) )
throw 'Found incorrect index.json inside snapshot dir: ' + snapshot_dir
if ( url && url . includes ( '://' ) ) {
urls_to_dirs [ url ] = path . join ( archive_dir , snapshot_dir )
}
}
}
return urls_to_dirs
}
let SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots ( ARCHIVE_DIR )
let all_snap_dirs = ( await fs . promises . readdir ( ARCHIVE_DIR ) )
// const orphan_snap_dirs = all_snap_dirs.filter(dirname => dirname.startsWith('19999'))
// // scan through existing snapshot dirs, move orphans to orphans/ or correct archive/<snapid>
// for (const snap_id of orphan_snap_dirs) {
// if (snap_id.startsWith('.')) continue
// const src_dir = path.join(ARCHIVE_DIR, snap_id)
// let src_path = src_dir
// assert((await fs.promises.stat(src_dir)).isDirectory())
// let dest_path = null
// const orphan_metrics_path = path.join(src_dir, 'metrics.json')
// if (fs.existsSync(orphan_metrics_path)) {
// const orphan_metrics = JSON.parse(await fs.promises.readFile(orphan_metrics_path, 'utf-8'))
// const url = orphan_metrics.url || orphan_metrics.URL
// const version = orphan_metrics.VERSION || versionStrFromDate(orphan_metrics.start_time)
// // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
// await symlinkBestSnapshotResults(src_dir)
// dest_path = SNAPSHOT_DIRS_BY_URL[url]
// const dest_id = dest_path?.split('/').at(-1)
// if (dest_id && (dest_id != snap_id)) {
// if (fs.existsSync(dest_path)) {
// console.log(` - moving duplicate snap_dir ${src_dir} -> ${dest_path}`)
// } else {
// console.log(` - moving valid snap_dir ${src_dir} -> ${dest_path}`)
// }
// } else if (dest_id == snap_id) {
// continue
// } else {
// dest_path = path.join(ORPHANS_DIR, snap_id)
// console.log(` - moving orphan snap_dir ${src_dir} -> ${dest_path}`)
// }
// } else {
// // corrupt/par
// dest_path = path.join(PARTIALS_DIR, snap_id)
// console.log(` - moving partial snap_dir ${src_dir} -> ${dest_path}`)
// }
// if (dest_path) {
// for (const version_dir of (await fs.promises.readdir(path.join(src_path, 'versions')))) {
// const version_src = path.join(src_path, 'versions', version_dir)
// const version_dst = path.join(dest_path, 'versions', version_dir)
// // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
// await symlinkBestSnapshotResults(dest_path)
// assert(!fs.existsSync(version_dst))
// await fs.promises.rename(version_src, version_dst)
// console.log(' - ', version_src, '--->', version_dst)
// }
// await fs.promises.rename(src_dir, path.join(PARTIALS_DIR, snap_id))
// await symlinkBestSnapshotResults(dest_path)
// }
// }
// const duplicate_snap_dirs = (await fs.promises.readdir(DUPLICATES_DIR)).filter(dirname => dirname.startsWith('19999'))
// for (const snap_id of duplicate_snap_dirs) {
// const src_dir = path.join(DUPLICATES_DIR, snap_id)
// const metrics = JSON.parse(await fs.promises.readFile(path.join(src_dir, 'metrics.json'), 'utf-8'))
// }
// all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
// for (const snap_id of all_snap_dirs) {
// if (snap_id.startsWith('.')) continue
// const snap_dir = path.join(ARCHIVE_DIR, snap_id)
// const metrics_path = path.join(snap_dir, 'metrics.json')
// if (fs.existsSync(metrics_path)) {
// // console.log(' - updating snap_dir', snap_dir)
// await symlinkBestSnapshotResults(snap_dir)
// }
// }
// SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
fs . writeFileSync ( path . join ( DATA_DIR , 'queue.csv' ) , '' )
const snapIdFromDir = ( dir_path ) = >
dir_path . split ( '/archive/' ) . at ( - 1 )
const snapshot_dir_list = (
Object . entries ( SNAPSHOT_DIRS_BY_URL )
. sort ( ( [ _ak , a ] , [ _bk , b ] ) = >
Number ( snapIdFromDir ( b ) ) - Number ( snapIdFromDir ( a ) ) )
. reverse ( ) )
for ( const [ existing_url , snapshot_dir ] of snapshot_dir_list ) {
// if (existing_url.startsWith('https://www.facebook.com/')) {
const is_desired_url = ! ( existing_url . includes ( 'facebook.com/' ) || existing_url . includes ( 'instagram.com/' ) )
const already_archived = false // fs.existsSync(path.join(SNAPSHOT_DIRS_BY_URL[existing_url], 'versions'))
if ( is_desired_url && ! already_archived ) {
// URLS.push(existing_url)
fs . appendFileSync (
path . join ( DATA_DIR , 'queue.csv' ) ,
`${SNAPSHOT_DIRS_BY_URL[existing_url]},${existing_url}\n`,
'utf-8' ,
)
}
}
URLS = [ . . . new Set ( URLS ) ]
console . log ( '[+] Added' , URLS . length , 'existing urls to queue...' )
/********************** Config: Output Paths **********************************/
// const TASK_PATH = (url) => path.join(DATA_DIR, 'results', `${hashCode(url)}`)
const TASK_PATH = ( url ) = > SNAPSHOT_DIRS_BY_URL [ url ] || path . join ( ARCHIVE_DIR , `1999999999.${hashCode(url)}` )
// const TASK_PATH = (url) => {
// const existing_snap_dir = SNAPSHOT_DIRS_BY_URL[url]
// assert(existing_snap_dir, `Could not find existing snapshot dir for ${url}`)
// return existing_snap_dir
// }
const OUTPUT_PATH = ( page , filename , extname = '' ) = >
path . join ( TASK_PATH ( page . _original_url ) , `${filename}${extname}` )
const SSL_PATH = ( page ) = > OUTPUT_PATH ( page , 'ssl.json' )
const CONSOLELOG_PATH = ( page ) = > OUTPUT_PATH ( page , 'console.log' )
const HEADERS_PATH = ( page ) = > OUTPUT_PATH ( page , 'headers.json' )
const REDIRECTS_PATH = ( page ) = > OUTPUT_PATH ( page , 'redirects.json' )
const REQUESTS_PATH = ( page ) = > OUTPUT_PATH ( page , 'requests.json' )
const TRACE_PATH = ( page ) = > OUTPUT_PATH ( page , 'trace.json' )
const METRICS_PATH = ( page ) = > OUTPUT_PATH ( page , 'metrics.json' )
const OUTLINKS_PATH = ( page ) = > OUTPUT_PATH ( page , 'outlinks.json' )
const SEO_PATH = ( page ) = > OUTPUT_PATH ( page , 'seo.json' )
const FAVICON_PATH = ( page ) = > OUTPUT_PATH ( page , 'favicon.json' )
const TITLE_PATH = ( page ) = > OUTPUT_PATH ( page , 'title.txt' )
const BODYTEXT_PATH = ( page ) = > OUTPUT_PATH ( page , 'body.txt' )
const PANDOC_PATH = ( page ) = > OUTPUT_PATH ( page , 'pandoc.md' )
const READABILITY_PATH = ( page ) = > OUTPUT_PATH ( page , 'readability.json' )
const ACCESIBILITY_PATH = ( page ) = > OUTPUT_PATH ( page , 'accessibility.json' )
const DOM_PATH = ( page ) = > OUTPUT_PATH ( page , 'dom.html' )
const PDF_PATH = ( page ) = > OUTPUT_PATH ( page , 'output.pdf' )
const SCREENSHOT_PATH = ( page ) = > OUTPUT_PATH ( page , 'screenshot.png' )
const SCREENSHOT_JPG_PATH = ( page ) = > OUTPUT_PATH ( page , 'screenshot.jpg' )
const AIQA_PATH = ( page ) = > OUTPUT_PATH ( page , 'aiqa.json' )
const SINGLEFILE_PATH = ( page ) = > OUTPUT_PATH ( page , 'singlefile.html' )
const YTDLP_PATH = ( page ) = > OUTPUT_PATH ( page , 'media/' )
const GALLERYDL_PATH = ( page ) = > OUTPUT_PATH ( page , 'photos/' )
const SCREENRECORDING_PATH = ( page ) = > OUTPUT_PATH ( page , 'screenrecording.mp4' )
const SCREENRECORDGIF_PATH = ( page ) = > OUTPUT_PATH ( page , 'screenrecording.gif' )
const RESPONSES_PATH = ( page ) = > OUTPUT_PATH ( page , 'responses' )
const RAW_PATH = ( page ) = > OUTPUT_PATH ( page , 'raw' )
/********************** Config: Chrome Extensions *****************************/
interface ChromeExtension {
name : string
webstore_id : string
}
interface LoadedChromeExtension extends ChromeExtension {
id? : string
webstore_url? : string
crx_url? : string
crx_path? : string
unpacked_path? : string
read_manifest ? : ( ) = > any
read_version ? : ( ) = > string | null
}
const CHROME_EXTENSIONS : LoadedChromeExtension [ ] = [
// Content access / unblocking / blocking plugins
{ webstore_id : 'ifibfemgeogfhoebkmokieepdoobkbpo' , name : 'twocaptcha' } , // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
{ webstore_id : 'edibdbjcniadpccecjdfdjjppcpchdlm' , name : 'istilldontcareaboutcookies' } ,
{ webstore_id : 'cjpalhdlnbpafiamejdnhcphjbkeiagm' , name : 'ublock' } ,
// {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'},
// {webstore_id: 'mnjggcdmjocbbbhaepdhchncahnbgone', name: 'sponsorblock'},
// {webstore_id: 'iplffkdpngmdjhlpjmppncnlhomiipha', name: 'unpaywall'},
// {webstore_id: 'gofocbepaccnkpphbgjpolififgcakhn', name: 'spaywallnews'},
// Archiving plugins
{ webstore_id : 'mpiodijhokgodhhofbcjdecpffjipkle' , name : 'singlefile' } ,
// {webstore_id: 'fpeoodllldobpkbkabpblcfaogecpndd', name: 'archivewebpage'},
// {webstore_id: 'niloccemoadcdkdjlinkgdfekeahmflj', name: 'pocket'},
// {webstore_id: 'kenncghfghgolcbmckhiljgaabnpcaaa', name: 'warcreate'},
// {webstore_id: 'jjndjgheafjngoipoacpjgeicjeomjli', name: 'puppeteerstream'},
// Utilities for humans setting up/viewing/debugging the archiving session
// {webstore_id: 'aeblfdkhhhdcdjpifhhbdiojplfjncoa', name: '1password'},
// {webstore_id: 'fngmhnnpilhplaeedifhccceomclgfbg', name: 'editthiscookie'},
// {webstore_id: 'cgfpgnepljlgenjclbekbjdlgcodfmjp', name: 'simpletabsorter'},
// Scripting/automation plugins
// {webstore_id: 'jinjaccalgkegednnccohejagnlnfdag', name: 'violentmonkey'},
// {webstore_id: 'infppggnoaenmfagbfknfkancpbljcca', name: 'automa'},
// {webstore_id: 'pfegffhjcgkneoemnlniggnhkfioidjg', name: 'screenscraper'},
]
/******************** Config: Chrome Profile Preferences **********************/
// https://niek.github.io/chrome-features/
const CHROME_DISABLED_COMPONENTS = [
'Translate' ,
'AcceptCHFrame' ,
'OptimizationHints' ,
'ProcessPerSiteUpToMainFrameThreshold' ,
'InterestFeedContentSuggestions' ,
'CalculateNativeWinOcclusion' ,
'BackForwardCache' ,
'HeavyAdPrivacyMitigations' ,
'LazyFrameLoading' ,
'ImprovedCookieControls' ,
'PrivacySandboxSettings4' ,
'AutofillServerCommunication' ,
'CertificateTransparencyComponentUpdater' ,
'DestroyProfileOnBrowserClose' ,
'CrashReporting' ,
'OverscrollHistoryNavigation' ,
'InfiniteSessionRestore' ,
//'LockProfileCookieDatabase', // disabling allows multiple chrome instances to concurrently modify profile, but might make chrome much slower https://github.com/yt-dlp/yt-dlp/issues/7271 https://issues.chromium.org/issues/40901624
]
const CHROME_PREFERENCES_EXTRA = { }
const CHROME_PREFERENCES_DEFAULT = {
// https://chromium.googlesource.com/chromium/src/+/32352ad08ee673a4d43e8593ce988b224f6482d3/chrome/common/pref_names.cc
homepage : 'about:blank' , // doesn't work here, managed by Secure Preferences
homepage_is_newtabpage : false , // doesn't work here, managed by Secure Preferences
session : { // doesn't work here, managed by Secure Preferences
restore_on_startup : 4 , // doesn't work here, managed by Secure Preferences
startup_urls : 'about:blank' , // doesn't work here, managed by Secure Preferences
} ,
default_apps : 'noinstall' ,
browser : {
confirm_to_quit : false ,
enable_spellchecking : false ,
check_default_browser : false ,
show_update_promotion_info_bar : false ,
} ,
profile : {
// name: 'ArchiveBox Persona: Default', // doesnt work to change display name, not sure why
// using_default_name: false,
exited_cleanly : true ,
default_content_setting_values : {
automatic_downloads : 1 ,
} ,
} ,
bookmark_bar : { show_on_all_tabs : false } ,
safebrowsing : { enabled : false } ,
search : { suggest_enabled : false } ,
download : {
prompt_for_download : false ,
open_pdf_in_system_reader : true ,
// default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
} ,
select_file_dialogs : { allowed : false } ,
autofill : { save_data : false } ,
printing : { enabled : false } ,
message_center : { welcome_notification_dismissed_local : true } ,
extensions : {
ui : {
developer_mode : true ,
dismissed_adt_promo : true ,
} ,
// pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
} ,
webkit : {
webprefs : {
javascript_enabled : true ,
minimum_font_size : 9 ,
// default_font_size: 12,
// web_security_enabled: false,
// allow_displaying_insecure_content: true,
// allow_running_insecure_content: true,
java_enabled : true ,
loads_images_automatically : true ,
} ,
} ,
settings : {
multi_profile_never_show_intro : true ,
multi_profile_warning_show_dismissed : true ,
first_run_tutorial_shown : true ,
} ,
plugins : {
always_open_pdf_externally : true ,
} ,
}
const CHROME_PREFERENCES_PATH = path . join ( CHROME_PROFILE_PATH , 'Default' , 'Preferences' )
const getChromePreferences = ( { CHROME_PREFERENCES_DEFAULT , CHROME_PREFERENCES_EXTRA , CHROME_EXTENSIONS , CHROME_DOWNLOADS_DIR } ) = >
merge . all ( [ CHROME_PREFERENCES_DEFAULT , CHROME_PREFERENCES_EXTRA , {
extensions : {
pinned_extensions : CHROME_EXTENSIONS?.map ( ( { id } ) = > id ) || [ ] ,
} ,
download : {
default_directory : CHROME_DOWNLOADS_DIR || path . join ( __dirname , 'downloads' ) ,
} ,
} ] )
function applyChromePreferences ( puppeteer , prefs_path , preferences ) {
if ( fs . existsSync ( prefs_path ) ) {
const preferences_existing = JSON . parse ( fs . readFileSync ( prefs_path , 'utf-8' ) )
const preferences_merged = merge ( preferences_existing , preferences )
// console.log(JSON.stringify(preferences_merged, null, 4))
fs . writeFileSync ( prefs_path , JSON . stringify ( preferences_merged ) )
} else {
// otherwise profile has not been created yet, use plugin instead (plugin only works on first creation)
puppeteer . use ( PrefsPlugin ( { userPrefs : preferences } ) )
}
return puppeteer
}
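// Hedged convenience wrapper tying the two helpers above together (configureChromeProfilePrefs is a
// hypothetical name, not called anywhere below); it merges the default/extra prefs and writes them
// into the profile (or registers the prefs plugin) before launch.
const configureChromeProfilePrefs = () =>
    applyChromePreferences(
        PupeteerExtra,
        CHROME_PREFERENCES_PATH,
        getChromePreferences({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_EXTENSIONS, CHROME_DOWNLOADS_DIR}),
    )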
/******************** Config: Chrome Launch Args ******************************/
const CHROME_ARGS_DEFAULT = [
// Headless behavior tuning, deterministic behavior settings
// '--headless=new',
'--test-type' ,
'--test-type=gpu' , // https://github.com/puppeteer/puppeteer/issues/10516
'--deterministic-mode' ,
'--js-flags=--random-seed=1157259159' , // make all JS random numbers deterministic by providing a seed
'--allow-pre-commit-input' , // allow JS mutations before page rendering is complete
'--disable-blink-features=AutomationControlled' , // hide the signatures that announce browser is being remote-controlled
// '--enable-automation', // <- DONT USE THIS, it makes you easily detectable / blocked by cloudflare
// `--proxy-server=https://43.159.28.126:2334:u7ce652b7568805c4-zone-custom-region-us-session-szGWq3FRU-sessTime-60:u7ce652b7568805c4`, // send all network traffic through a proxy https://2captcha.com/proxy
// `--proxy-bypass-list=127.0.0.1`,
// Docker-specific options
// https://github.com/GoogleChrome/lighthouse-ci/tree/main/docs/recipes/docker-client#--no-sandbox-issues-explained
// '--no-sandbox', // rely on docker sandboxing in docker, otherwise we need cap_add: SYS_ADM to use host sandboxing
// '--disable-gpu-sandbox',
// '--disable-setuid-sandbox',
// '--disable-dev-shm-usage', // docker 75mb default shm size is not big enough, disabling just uses /tmp instead
// '--no-xshm',
// Profile data dir setup
// chrome://profile-internals
`--user-data-dir=${CHROME_PROFILE_PATH}`,
`--profile-directory=${CHROME_PROFILE_USER}`,
'--password-store=basic' , // use mock keychain instead of OS-provided keychain (we manage auth.json instead)
'--use-mock-keychain' ,
'--disable-cookie-encryption' , // we need to be able to write unencrypted cookies to save/load auth.json
// '--disable-sync', // don't try to use Google account sync features
// Extensions
// chrome://inspect/#extensions
// `--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`, // not needed when using existing profile that already has extensions installed
`--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({webstore_id}) => webstore_id).join(',')}`,
'--allow-legacy-extension-manifests' ,
// Browser window and viewport setup
// chrome://version
// `--user-agent="${DEFAULT_USER_AGENT}"`,
// `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
'--window-position=0,0' ,
'--hide-scrollbars' , // hide scrollbars because otherwise they show up in screenshots
'--install-autogenerated-theme=169,32,85' , // red border makes it easier to see which chrome window is archivebox's
'--autoplay-policy=no-user-gesture-required' , // auto-start videos so they trigger network requests + show up in outputs
'--disable-gesture-requirement-for-media-playback' ,
'--lang=en-US,en;q=0.9' ,
// DANGER: JS isolation security features (to allow easier tampering with pages during archiving)
// chrome://net-internals
// '--disable-web-security', // <- WARNING, breaks some sites that expect/enforce strict CORS headers (try webflow.com)
// '--disable-features=IsolateOrigins,site-per-process', // useful for injecting JS, but some very strict sites can panic / show error pages when isolation is disabled (e.g. webflow.com)
// '--allow-running-insecure-content', // Breaks CORS/CSRF/HSTS etc., useful sometimes but very easy to detect
// '--allow-file-access-from-files', // <- WARNING, dangerous, allows JS to read filesystem using file:// URLs
// // DANGER: Disable HTTPS verification
// '--ignore-certificate-errors',
// '--ignore-ssl-errors',
// '--ignore-certificate-errors-spki-list',
// '--allow-insecure-localhost',
// IO: stdin/stdout, debug port config
// chrome://inspect
'--log-level=2' , // 1=DEBUG 2=WARNING 3=ERROR
'--enable-logging=stderr' ,
'--remote-debugging-address=0.0.0.0' ,
`--remote-debugging-port=${CHROME_DEBUG_PORT}`,
// GPU, canvas, text, and pdf rendering config
// chrome://gpu
'--enable-webgl' , // enable web-gl graphics support
'--font-render-hinting=none' , // make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;}
'--force-color-profile=srgb' , // make rendering more deterministic by using consistent color profile, if browser looks weird, try: generic-rgb
'--disable-partial-raster' , // make rendering more deterministic (TODO: verify if still needed)
'--disable-skia-runtime-opts' , // make rendering more deterministic by avoiding Skia hot path runtime optimizations
'--disable-2d-canvas-clip-aa' , // make rendering more deterministic by disabling antialiasing on 2d canvas clips
// '--disable-gpu', // falls back to more consistent software renderer
// // '--use-gl=swiftshader', <- DO NOT USE, breaks M1 ARM64. it makes rendering more deterministic by using simpler CPU renderer instead of OS GPU renderer bug: https://groups.google.com/a/chromium.org/g/chromium-dev/c/8eR2GctzGuw
// // '--disable-software-rasterizer', <- DO NOT USE, harmless, used in tandem with --disable-gpu
// // '--run-all-compositor-stages-before-draw', <- DO NOT USE, makes headful chrome hang on startup (tested v121 Google Chrome.app on macOS)
// // '--disable-gl-drawing-for-tests', <- DO NOT USE, disables gl output (makes tests run faster if you dont care about canvas)
// // '--blink-settings=imagesEnabled=false', <- DO NOT USE, disables images entirely (only sometimes useful to speed up loading)
// Process management & performance tuning
// chrome://process-internals
'--disable-lazy-loading' , // make rendering more deterministic by loading all content up-front instead of on-focus
'--disable-renderer-backgrounding' , // dont throttle tab rendering based on focus/visibility
'--disable-background-networking' , // dont throttle tab networking based on focus/visibility
'--disable-background-timer-throttling' , // dont throttle tab timers based on focus/visibility
'--disable-backgrounding-occluded-windows' , // dont throttle tab window based on focus/visibility
'--disable-ipc-flooding-protection' , // dont throttle ipc traffic or accessing big request/response/buffer/etc. objects will fail
'--disable-extensions-http-throttling' , // dont throttle http traffic based on runtime heuristics
'--disable-field-trial-config' , // disable shared field trial state between browser processes
'--disable-back-forward-cache' , // disable browsing navigation cache
// '--in-process-gpu', <- DONT USE THIS, makes headful startup time ~5-10s slower (tested v121 Google Chrome.app on macOS)
// '--disable-component-extensions-with-background-pages', // TODO: check this, disables chrome components that only run in background (could lower startup time)
// uncomment to disable hardware camera/mic/speaker access + present fake devices to websites
// (faster to disable, but disabling breaks recording browser audio in puppeteer-stream screenrecordings)
// '--use-fake-device-for-media-stream',
// '--use-fake-ui-for-media-stream',
// '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider',
// // Output format options (PDF, screenshot, etc.)
'--export-tagged-pdf' , // include table of contents and tags in printed PDFs
'--generate-pdf-document-outline' ,
// Suppress first-run features, popups, hints, updates, etc.
// chrome://system
'--no-pings' ,
'--no-first-run' ,
'--no-default-browser-check' ,
'--disable-default-apps' ,
'--ash-no-nudges' ,
'--disable-infobars' ,
'--disable-search-engine-choice-screen' ,
'--disable-session-crashed-bubble' ,
'--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"' ,
'--hide-crash-restore-bubble' ,
'--suppress-message-center-popups' ,
'--disable-client-side-phishing-detection' ,
'--disable-domain-reliability' ,
'--disable-component-update' ,
'--disable-datasaver-prompt' ,
'--disable-hang-monitor' ,
'--disable-speech-synthesis-api' ,
'--disable-speech-api' ,
'--disable-print-preview' ,
'--safebrowsing-disable-auto-update' ,
'--deny-permission-prompts' ,
'--disable-external-intent-requests' ,
'--disable-notifications' ,
'--disable-desktop-notifications' ,
'--noerrdialogs' ,
'--disable-popup-blocking' ,
'--disable-prompt-on-repost' ,
'--silent-debugger-extension-api' ,
'--block-new-web-contents' ,
'--metrics-recording-only' ,
'--disable-breakpad' ,
// other feature flags
// chrome://flags chrome://components
`--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
'--enable-features=NetworkService' ,
]
const CHROME_ARGS_EXTRA = [ ]
const CHROME_LAUNCH_OPTIONS = {
CHROME_PROFILE_PATH ,
CHROME_PROFILE_USER ,
CHROME_EXTENSIONS ,
CHROME_DEBUG_PORT ,
CHROME_DISABLED_COMPONENTS ,
DEFAULT_VIEWPORT ,
CHROME_ARGS_DEFAULT ,
CHROME_ARGS_EXTRA ,
}
/* Chrome CLI Args Documentation
 - https://github.com/GoogleChrome/chrome-launcher/blob/main/docs/chrome-flags-for-tools.md
 - https://chromium.googlesource.com/chromium/chromium/+/master/content/public/common/content_switches.cc
 - https://jtway.co/optimize-your-chrome-options-for-testing-to-get-x1-25-impact-4f19f071bf45
 - https://peter.sh/experiments/chromium-command-line-switches/
 - https://www.chromium.org/developers/how-tos/run-chromium-with-flags/
 - https://github.com/manoj9788/Chrome-Driver-arguments/blob/master/README.md
*/
const getChromeArgs = ( { CHROME_ARGS_DEFAULT , CHROME_ARGS_EXTRA ,
CHROME_PROFILE_PATH , CHROME_PROFILE_USER ,
CHROME_EXTENSIONS ,
CHROME_DEBUG_PORT ,
CHROME_DISABLED_COMPONENTS ,
DEFAULT_VIEWPORT } = CHROME_LAUNCH_OPTIONS ) = >
[
. . . CHROME_ARGS_DEFAULT ,
`--user-data-dir=${CHROME_PROFILE_PATH}`,
`--profile-directory=${CHROME_PROFILE_USER}`,
`--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`,
`--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({id}) => id).join(',')}`,
`--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
`--remote-debugging-port=${CHROME_DEBUG_PORT}`,
`--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
. . . CHROME_ARGS_EXTRA ,
]
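// Hedged sketch of a plain (non-cluster) launch fed by getChromeArgs(); the real session is started
// further below (optionally via puppeteer-cluster), and CHROME_BINARY is an assumed env var here.
async function launchChromeSketch() {
    return await PupeteerExtra.launch({
        headless: false,                                            // script expects a visible window (see theme/window args above)
        executablePath: process.env.CHROME_BINARY || undefined,     // assumption: optional custom Chrome binary path
        userDataDir: CHROME_PROFILE_PATH,
        defaultViewport: DEFAULT_VIEWPORT,
        args: getChromeArgs(CHROME_LAUNCH_OPTIONS),
    })
}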
/******************** Chrome Extension Management *****************************/
function getExtensionId ( unpacked_path ) {
const manifest_path = path . join ( unpacked_path , 'manifest.json' )
if ( ! fs . existsSync ( manifest_path ) ) return null
// chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id
const hash = crypto . createHash ( 'sha256' ) ;
hash . update ( Buffer . from ( unpacked_path , 'utf-8' ) ) ;
const detected_extension_id = Array . from ( hash . digest ( 'hex' ) )
. slice ( 0 , 32 ) // Convert each hexadecimal character to a character in the range 'a'-'p'
. map ( i = > String . fromCharCode ( parseInt ( i , 16 ) + 'a' . charCodeAt ( 0 ) ) )
. join ( '' ) ;
return detected_extension_id
}
async function installExtension ( extension ) {
const manifest_path = path . join ( extension . unpacked_path , 'manifest.json' )
// Download extensions using:
// curl -fsSL 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D$EXTENSION_ID%26uc' > extensionname.crx
// unzip -d extensionname extensionname.zip
if ( ! fs . existsSync ( manifest_path ) && ! fs . existsSync ( extension . crx_path ) ) {
console . log ( "[🛠️] Downloading missing extension" , extension . name , extension . webstore_id , '->' , extension . crx_path ) ;
// Download crx file from ext.crx_url -> ext.crx_path
const response = await fetch ( extension . crx_url ) as Response
const crx_file = fs . createWriteStream ( extension . crx_path ) ;
if ( response . headers . get ( "content-length" ) && response . body ) {
// @ts-ignore
const crx_stream = Readable . fromWeb ( response . body )
await finished ( crx_stream . pipe ( crx_file ) )
} else {
console . warn ( '[⚠️] Failed to download extension' , extension . name , extension . webstore_id )
}
}
var { stdout , stderr } = { stdout : '' , stderr : '' }
// Unzip crx file from ext.crx_url -> ext.unpacked_path
await fs . promises . mkdir ( extension . unpacked_path , { recursive : true } )
try {
var { stdout , stderr } = await exec ( ` /usr/bin/unzip ${ extension . crx_path } -d ${ extension . unpacked_path } ` )
} catch ( err1 ) {
try {
await unzip ( extension . crx_path , extension . unpacked_path )
} catch ( err2 ) {
// console.error(`[❌] Failed to install ${extension.crx_path}: could not unzip crx`, err1, err2)
// return false
}
}
if ( ! fs . existsSync ( manifest_path ) )
console . error ( ` [❌] Failed to install ${ extension . crx_path } : could not find manifest.json in unpacked_path ` , stdout , stderr )
return fs . existsSync ( manifest_path )
}
async function loadOrInstallExtension ( ext ) {
if ( ! ( ext . webstore_id || ext . unpacked_path ) )
throw 'Extension must have either {webstore_id} or {unpacked_path}'
// Set statically computable extension metadata
ext . webstore_id = ext . webstore_id || ext . id
ext . name = ext . name || ext . webstore_id
ext . webstore_url = ext . webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`
ext . crx_url = ext . crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`
ext . crx_path = ext . crx_path || path . join ( CHROME_EXTENSIONS_DIR , `${ext.webstore_id}__${ext.name}.crx` )
ext . unpacked_path = ext . unpacked_path || path . join ( CHROME_EXTENSIONS_DIR , `${ext.webstore_id}__${ext.name}` )
const manifest_path = path . join ( ext . unpacked_path , 'manifest.json' )
ext . read_manifest = ( ) = > JSON . parse ( fs . readFileSync ( manifest_path , 'utf-8' ) )
ext . read_version = ( ) = > fs . existsSync ( manifest_path ) && ext . read_manifest ( ) ? . version || null
// if extension is not installed, download and unpack it
if ( ! ext . read_version ( ) ) {
await installExtension ( ext )
}
// autodetect id from filesystem path (unpacked extensions dont have stable IDs)
ext . id = getExtensionId ( ext . unpacked_path )
ext . version = ext . read_version ( )
if ( ! ext . version ) {
console . warn ( '[❌] Unable to detect ID and version of installed extension' , prettyPath ( ext . unpacked_path ) )
} else {
console . log ( ` [➕ ] Installed extension ${ ext . name } ( ${ ext . version } )... ` . padEnd ( 82 ) , prettyPath ( ext . unpacked_path ) )
}
return ext
}
async function isTargetExtension ( target ) {
let target_type
let target_ctx
let target_url
try {
target_type = target . type ( )
target_ctx = ( await target . worker ( ) ) || ( await target . page ( ) ) || null
target_url = target . url ( ) || target_ctx ? . url ( ) || null
} catch ( err ) {
if ( String ( err ) . includes ( 'No target with given id found' ) ) {
// because this runs on initial browser startup, we sometimes race with closing the initial
// new tab page. it will throw a harmless error if we try to check a target that's already closed,
// ignore it and return null since that page is definitely not an extension's bg page anyway
target_type = 'closed'
target_ctx = null
target_url = 'about:closed'
} else {
throw err
}
}
const target_is_bg = [ 'service_worker' , 'background_page' ] . includes ( target_type )
const target_is_extension = target_url ? . startsWith ( 'chrome-extension://' )
const extension_id = ( target_is_extension && target_url . split ( '://' ) [ 1 ] . split ( '/' ) [ 0 ] ) || null
const manifest_version = target_type === 'service_worker' ? '3' : '2'
return {
target_type ,
target_ctx ,
target_url ,
target_is_bg ,
target_is_extension ,
extension_id ,
manifest_version ,
}
}
async function loadExtensionFromTarget ( extensions , target ) {
const {
target_is_bg ,
target_is_extension ,
target_type ,
target_ctx ,
target_url ,
extension_id ,
manifest_version ,
} = await isTargetExtension ( target )
if ( ! ( target_is_bg && extension_id && target_ctx ) )
return null
const manifest = await target_ctx . evaluate ( ( ) = >
// @ts-ignore
chrome . runtime . getManifest ( ) )
const { name , version , homepage_url , options_page , options_ui } = manifest
if ( ! version || ! extension_id )
return null
const options_url = await target_ctx . evaluate (
( options_page ) = > chrome . runtime . getURL ( options_page ) ,
options_page || options_ui ? . page || 'options.html' ,
)
const commands = await target_ctx . evaluate ( async ( ) = >
( await new Promise ( ( resolve , reject ) = > {
if ( chrome . commands )
chrome . commands . getAll ( resolve )
else
resolve ( { } )
} ) )
)
// console.log(`[+] Found Manifest V${manifest_version} Extension:`, extension_id, name, target_url, Object.keys(commands).length)
let dispatchEval = async ( . . . args ) = >
await target_ctx . evaluate ( . . . args )
let dispatchPopup = async ( ) = >
await target_ctx . evaluate ( 'chrome.action?.openPopup() || chrome.tabs.create({url: chrome.runtime.getURL("popup.html")})' )
let dispatchAction
let dispatchMessage
let dispatchCommand
if ( manifest_version === '3' ) {
dispatchAction = async ( tab ) = > {
// https://developer.chrome.com/docs/extensions/reference/api/action#event-onClicked
return await target_ctx . evaluate ( async ( tab ) = > {
tab = tab || ( await new Promise ( ( resolve ) = >
chrome . tabs . query ( { currentWindow : true , active : true } , ( [ tab ] ) = > resolve ( tab ) ) ) )
// @ts-ignore
return await chrome . action . onClicked . dispatch ( tab )
} , tab )
}
dispatchMessage = async ( message , options ) = > {
// https://developer.chrome.com/docs/extensions/reference/api/runtime
return await target_ctx . evaluate ( async ( extension_id , message , options ) = > {
return await chrome . runtime . sendMessage ( extension_id , message , options )
} , extension_id , message , options )
}
dispatchCommand = async ( command , tab ) = > {
// https://developer.chrome.com/docs/extensions/reference/api/commands#event-onCommand
return await target_ctx . evaluate ( async ( command , tab ) = > {
// @ts-ignore
return await chrome . commands . onCommand . dispatch ( command , tab )
} , command , tab )
}
} else if ( manifest_version === '2' ) {
dispatchAction = async ( tab ) = > {
// https://developer.chrome.com/docs/extensions/mv2/reference/browserAction#event-onClicked
return await target_ctx . evaluate ( async ( tab ) = > {
tab = tab || ( await new Promise ( ( resolve ) = >
chrome . tabs . query ( { currentWindow : true , active : true } , ( [ tab ] ) = > resolve ( tab ) ) ) )
// @ts-ignore
return await chrome . browserAction . onClicked . dispatch ( tab )
} , tab )
}
dispatchMessage = async ( message , options ) = > {
// https://developer.chrome.com/docs/extensions/mv2/reference/runtime#method-sendMessage
return await target_ctx . evaluate ( async ( extension_id , message , options ) = > {
return await new Promise ( ( resolve ) = >
chrome . runtime . sendMessage ( extension_id , message , options , resolve )
)
} , extension_id , message , options )
}
dispatchCommand = async ( command , tab ) = > {
// https://developer.chrome.com/docs/extensions/mv2/reference/commands#event-onCommand
return await target_ctx . evaluate ( async ( command , tab ) = > {
return await new Promise ( ( resolve ) = >
// @ts-ignore
chrome . commands . onCommand . dispatch ( command , tab , resolve )
)
} , command , tab )
}
}
const existing_extension = extensions . filter ( ( { id } ) = > id === extension_id ) [ 0 ] || { }
const new_extension = {
. . . existing_extension ,
id : extension_id ,
webstore_name : name ,
target ,
target_ctx ,
target_type ,
target_url ,
manifest_version ,
manifest ,
version ,
homepage_url ,
options_url ,
dispatchEval , // run some JS in the extension's service worker context
dispatchPopup , // open the extension popup
dispatchAction , // trigger an extension menubar icon click
dispatchMessage , // send a chrome runtime message in the service worker context
dispatchCommand , // trigger an extension keyboard shortcut command
}
console . log ( ` [➕ ] Loaded extension ${ name . substring ( 0 , 32 ) } ( ${ version } ) ${ target_type } ... ` . padEnd ( 82 ) , target_url )
Object . assign ( existing_extension , new_extension )
return new_extension
}
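// Usage sketch (hedged): once loaded, an extension can be driven through these dispatch handlers, e.g.
//   const singlefile = extensions.find(ext => ext.name === 'singlefile')
//   await singlefile.dispatchAction()   // equivalent to clicking its toolbar icon on the active tab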
async function getChromeExtensionsFromPersona ( { CHROME_EXTENSIONS , CHROME_EXTENSIONS_DIR } ) {
console . log ( '*************************************************************************' )
console . log ( ` [⚙️] Installing ${ CHROME_EXTENSIONS . length } chrome extensions from CHROME_EXTENSIONS... ` )
try {
// read extension metadata from filesystem (installing from Chrome webstore if extension is missing)
for ( const extension of CHROME_EXTENSIONS ) {
Object . assign ( extension , await loadOrInstallExtension ( extension ) )
}
// for easier debugging, write parsed extension info to filesystem
await overwriteFile (
CHROME_EXTENSIONS_JSON_PATH . replace ( '.json' , '.present.json' ) ,
CHROME_EXTENSIONS ,
)
} catch ( err ) {
console . error ( err )
}
console . log ( '*************************************************************************' )
return CHROME_EXTENSIONS
}
let _EXTENSIONS_CACHE = null
async function getChromeExtensionsFromCache ( { browser , extensions = CHROME_EXTENSIONS , extensions_dir = CHROME_EXTENSIONS_DIR } ) {
if ( _EXTENSIONS_CACHE === null ) {
console . log ( ` [⚙️] Loading ${ CHROME_EXTENSIONS . length } chrome extensions from CHROME_EXTENSIONS... ` )
// find loaded Extensions at runtime / browser launch time & connect handlers
// looks at all the open targets for extension service workers / bg pages
for ( const target of browser . targets ( ) ) {
// mutates extensions object in-place to add metadata loaded from filesystem persona dir
await loadExtensionFromTarget ( extensions , target )
}
_EXTENSIONS_CACHE = extensions
// write installed extension metadata to filesystem extensions.json for easier debugging
await overwriteFile (
CHROME_EXTENSIONS_JSON_PATH . replace ( '.json' , '.loaded.json' ) ,
extensions ,
)
await overwriteSymlink (
CHROME_EXTENSIONS_JSON_PATH . replace ( '.json' , '.loaded.json' ) ,
CHROME_EXTENSIONS_JSON_PATH ,
)
}
return _EXTENSIONS_CACHE
}
async function setup2CaptchaExtension ( { browser , extensions } ) {
let page = null
try {
// open a new tab to finish setting up the 2captcha extension manually using its extension options page
page = await browser . newPage ( )
const { options_url } = extensions . filter ( ext = > ext . name === 'twocaptcha' ) [ 0 ]
await page . goto ( options_url )
await wait ( 2_500 )
await page . bringToFront ( )
// type in the API key and click the Login button (and auto-close success modal after it pops up)
await page . evaluate ( ( ) = > {
const elem = document . querySelector ( "input[name=apiKey]" ) as HTMLInputElement
elem . value = ""
} )
await page . type ( 'input[name=apiKey]' , API_KEY_2CAPTCHA , { delay : 25 } )
// toggle all the important switches to ON
await page . evaluate ( ( ) = > {
const checkboxes = Array . from ( document . querySelectorAll < HTMLInputElement > ( 'input#isPluginEnabled, input[name*=enabledFor], input[name*=autoSolve]' ) ) ;
for ( const checkbox of checkboxes ) {
if ( ! checkbox . checked ) checkbox . click ( )
}
} )
let dialog_opened = false
page . on ( 'dialog' , async ( dialog ) = > {
setTimeout ( async ( ) = > {
await dialog . accept ( ) ;
dialog_opened = true
} , 500 ) ;
} )
await page . click ( 'button#connect' )
await wait ( 2_500 )
if ( ! dialog_opened ) {
throw ` 2captcha extension login confirmation dialog never opened, please check its options page manually: ${ options_url } `
}
console . log ( '[🔑] Configured the 2captcha extension using its options page...' )
} catch ( err ) {
console . warn ( ` [❌] Failed to configure the 2captcha extension using its options page! ` , err )
}
if ( page ) await page . close ( )
}
async function speedtest ( { browser , page , measureUpload = true , timeout = 25000 } : { browser? : Browser , page? : Page , measureUpload? : boolean , timeout? : number } ) {
// run a speedtest using fast.com, printing results once per second
browser = browser || await page . browser ( )
page = page || await browser . newPage ( )
// save one speedtest_<date>.json result per day
const today = versionStrFromDate ( new Date ( ) , { withDate : true , withTime : false } )
const SPEEDTEST_PATH = path . join ( SPEEDTESTS_DIR , `speedtest_${today}.json` )
// check if we've already run one today, if so return earlier results and skip running again
try {
return JSON . parse ( await fs . promises . readFile ( SPEEDTEST_PATH , 'utf-8' ) )
} catch ( err ) {
// otherwise speedtest does not exist yet for today, continue onwards...
}
console . log ( '[🚤] Running Speedtest using Fast.com...' . padEnd ( 82 ) , prettyPath ( SPEEDTEST_PATH ) )
await page . goto ( 'https://fast.com' , { timeout , waitUntil : 'domcontentloaded' } ) ;
await page . waitForSelector ( '#speed-value' , { timeout } )
let result = null
let loop_idx = 0
while ( loop_idx < 100 ) {
result = await page . evaluate ( ( ) = > {
const $ = document . querySelector . bind ( document ) ;
return {
downloadSpeed : Number ( $ ( '#speed-value' ) . textContent ) ,
downloadUnit : $ ( '#speed-units' ) . textContent . trim ( ) ,
downloaded : Number ( $ ( '#down-mb-value' ) . textContent . trim ( ) ) ,
uploadSpeed : Number ( $ ( '#upload-value' ) . textContent ) ,
uploadUnit : $ ( '#upload-units' ) . textContent . trim ( ) ,
uploaded : Number ( $ ( '#up-mb-value' ) . textContent . trim ( ) ) ,
latency : Number ( $ ( '#latency-value' ) . textContent . trim ( ) ) ,
bufferBloat : Number ( $ ( '#bufferbloat-value' ) . textContent . trim ( ) ) ,
userLocation : $ ( '#user-location' ) . textContent . trim ( ) ,
userIp : $ ( '#user-ip' ) . textContent . trim ( ) ,
isDone : Boolean ( $ ( '#speed-value.succeeded' ) && $ ( '#upload-value.succeeded' ) ) ,
} ;
} )
if ( result . downloadSpeed > 0 ) {
// console.log(JSON.stringify(result).replaceAll('"', '').replaceAll(',', ' ').replaceAll('{', '').replaceAll('}', ''))
}
if ( result . isDone || ( ! measureUpload && result . uploadSpeed ) ) {
break
}
await wait ( 500 )
loop_idx ++
}
await Promise . allSettled ( [
page . close ( ) ,
overwriteFile ( SPEEDTEST_PATH , result )
] )
return result
}
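// Usage sketch (hedged): run once per browser session; results are cached per-day under SPEEDTESTS_DIR:
//   const { downloadSpeed, downloadUnit, latency } = await speedtest({ browser })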
/******************************************************************************/
/******************************************************************************/
const ALREADY_ARCHIVED = new Set ( [ '' , 'about:blank' , 'chrome://newtab' , 'chrome://version' ] )
const TASKS_PER_RUN_LIMIT = 200
async function botArchiveTask ( { page , data , url = '' } ) {
url = url || data // puppeteer-cluster passes in the url value via the data: arg
const is_unarchivable_url = URL_SCHEMES_IGNORED . includes ( url . split ( ':' ) [ 0 ] )
const is_already_archived = ALREADY_ARCHIVED . has ( url . slice ( 0 , 4096 ) )
if ( is_unarchivable_url || is_already_archived ) return null
ALREADY_ARCHIVED . add ( url . slice ( 0 , 4096 ) )
if ( ALREADY_ARCHIVED . size > TASKS_PER_RUN_LIMIT ) {
console . warn ( '[❌] Hit maximum URLs archived per browser session, exiting to free memory.' )
console . warn ( ' Run this process again to continue with the next batch...' )
process . exit ( 21 )
}
const browser = await page . browser ( )
const client = await page . target ( ) . createCDPSession ( )
const extensions = await getChromeExtensionsFromCache ( { browser } )
const browser_version = await browser . version ( )
const original_url = url . toString ( )
const start_time = ( new Date ( ) )
console . log ( '[0/4]-------------------------------------------------------------------------' )
const snapshot_dir = await setupSnapshotDir ( { original_url , start_time } )
const snapshot = await setupSnapshotDB ( { original_url , start_time , snapshot_dir } )
console . log ( '[1/4]-------------------------------------------------------------------------' )
console . log ( ` [🪟] Starting page & viewport setup ( ${ browser_version } ${ DEFAULT_VIEWPORT . isMobile ? 'mobile' : 'desktop' } ${ DEFAULT_VIEWPORT . width } x ${ DEFAULT_VIEWPORT . height } px)... ` )
const page_state = {
// global static state
browser ,
client ,
browser_version ,
extensions ,
// per-page static metadata
original_url ,
snapshot ,
snapshot_dir ,
start_time : start_time.toISOString ( ) ,
start_ts : Number ( start_time ) ,
version : versionStrFromDate ( start_time ) ,
// per-page mutable archiving state
main_response : null ,
recorder : null ,
console_log : [ ] ,
traffic_log : { } ,
redirects : { } ,
}
page . _original_url = original_url
try {
// run all page setup functions in parallel
const results = await Promise . allSettled ( [
// loadAuthStorage(page, page_state, { apply: true }),
startMetadataRecording ( page , page_state ) ,
setupURLRewriting ( page , page_state ) ,
// setupViewport(page, page_state),
setupModalAutoClosing ( page , page_state ) ,
loadCloudflareCookie ( page , page_state ) ,
startResponseSaving ( page , page_state ) ,
saveYTDLP ( page , page_state ) ,
saveGALLERYDL ( page , page_state ) ,
// saveSourceMaps(page, page_state),
// TODO: someday setup https://github.com/osnr/TabFS ?
] ) ;
// run all page setup functions in parallel
const rejected = results
. filter ( result = > result . status === 'rejected' )
. map ( result = > ( result as PromiseRejectedResult ) . reason ) ;
if ( rejected . length ) console . warn ( '[⚠️] Partial failures during page setup:' , rejected ) ;
} catch ( err ) {
console . error ( '[❌] PAGE SETUP ERROR' , JSON . stringify ( err , null , 4 ) )
return
}
console . log ( '[2/4]-------------------------------------------------------------------------' )
console . log ( '[➡️] NAVIGATION[INI]' , ANSI . blue + url + ANSI . reset )
const startrecording_promise = startScreenrecording ( page , page_state )
page_state . main_response = await page . goto ( url , { waitUntil : 'load' , timeout : 40_000 } )
try {
const results = await Promise . allSettled ( [
startrecording_promise ,
page . bringToFront ( ) ,
page . waitForNetworkIdle ( { concurrency : 0 , idleTime : 900 , timeout : 20_000 } ) ,
] )
const rejected = results
. filter ( result => result . status === 'rejected' )
. map ( result => ( result as PromiseRejectedResult ) . reason )
if ( rejected . length ) console . warn ( '[⚠️] Partial failures during page load:' , rejected )
} catch ( err ) {
console . error ( '[❌] ERROR DURING PAGE LOAD' , JSON . stringify ( err , null , 4 ) )
return
}
if ( page_state . main_response === null ) {
page_state . main_response = await page . waitForResponse ( ( ) => true )
}
assert ( page_state . main_response )
if ( page_state . main_response . status ( ) == 429 ) {
throw ` [⚠️] Got 429 rate-limit response, skipping this URL for now... `
}
// emulate human browsing behavior
// await disableAnimations(page, page_state);
await jiggleMouse ( page , page_state ) ;
await solveCaptchas ( page , page_state ) ;
await blockRedirects ( page , page_state ) ;
await scrollDown ( page , page_state ) ;
// await expandComments(page, page_state);
await submitForm ( page , page_state ) ;
// await blockJSExecution(page, page_state);
console . log ( '[3/4]-------------------------------------------------------------------------' )
// stop tampering with page requests & JS / recording metadata / traffic log
await stopMetadataRecording ( page , page_state )
// do all synchronous archiving steps that need exclusive use of the whole page while they run
const saveScreenrecording_promise = saveScreenrecording ( page , page_state ) ;
await saveScreenshot ( page , page_state ) ;
await savePDF ( page , page_state ) ;
console . log ( '[4/4]-------------------------------------------------------------------------' )
// do all async archiving steps that can be run at the same time
await inlineShadowDOM ( page , page_state ) ;
const results = await Promise . allSettled ( [
saveTitle ( page , page_state ) ,
saveSEO ( page , page_state ) ,
saveFavicon ( page , page_state ) ,
saveSSL ( page , page_state ) ,
saveRequests ( page , page_state ) ,
saveRedirects ( page , page_state ) ,
saveHeaders ( page , page_state ) ,
saveRaw ( page , page_state ) ,
saveDOM ( page , page_state ) ,
saveBodyText ( page , page_state ) ,
// savePandoc(page, page_state),
saveReadability ( page , page_state ) ,
saveAccessibility ( page , page_state ) ,
saveOutlinks ( page , page_state ) ,
// saveAuthStorage(page, page_state),
saveAIQualityAssuranceResult ( page , page_state ) ,
] ) ;
// do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
const bg_results = Promise . allSettled ( [
saveScreenrecording_promise ,
saveSinglefile ( page , page_state ) ,
// saveArchiveWebPage(page, page_state),
// savePocket(page, page_state),
] )
const { duration } = await saveMetrics ( page , page_state ) ;
const rejected = results
. filter ( result => result . status === 'rejected' )
. map ( result => ( result as PromiseRejectedResult ) . reason ) // not sure why this has a ts-error, .reason does exist on rejected promises
if ( rejected . length )
console . warn ( '[⚠️] Partial failures during archiving:' , rejected )
// Start an interactive REPL here with the `page` instance.
// https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
// await page.repl()
// await page.browser().repl()
console . log ( ` [✅] ${ ANSI . blue } Finished archiving in ${ duration / 1000 } s. ${ ANSI . reset } ` )
try {
const rejected = ( await bg_results )
. filter ( result => result . status === 'rejected' )
. map ( result => ( result as PromiseRejectedResult ) . reason ) // not sure why this has a ts-error, .reason does exist on rejected promises
if ( rejected . length )
console . warn ( '[⚠️] Partial failures during wrap-up tasks:' , rejected )
console . log ( '[🗑️] Resetting to about:blank to ensure memory is freed...' )
await page . goto ( 'about:blank' )
await page . close ( )
} catch ( err ) {
console . log ( err )
}
// symlink the best results from across all the versions/ into the snapshot dir root
await symlinkBestSnapshotResults ( snapshot_dir )
// display latest version screenshot GIF
console . log ( )
try {
const latest_version_gif = path . join ( snapshot_dir , 'versions' , page_state . version , path . basename ( SCREENRECORDGIF_PATH ( page ) ) )
const dirent = await blockUntilExists ( latest_version_gif , { min_bytes : 100 , timeout : 15_000 } )
child_process . spawn ( '/Users/squash/.iterm2/imgcat' , [ dirent . abspath ] , { stdio : [ null , 'inherit' , 'inherit' ] } )
} catch ( err ) {
console . warn ( '[⚠️] Failed to display screenrecording.gif...' , err )
console . log ( )
}
// determine whether task succeeded or failed based on AI QA score
const latest_version_aiqa = path . join ( snapshot_dir , 'versions' , page_state . version , path . basename ( AIQA_PATH ( page ) ) )
const qa_results = JSON . parse ( ( await fs . promises . readFile ( latest_version_aiqa ) ) . toString ( ) )
if ( qa_results . pct_visible < 50 ) {
throw ` [❌] Task completed with problems, got AI QA score of ${ qa_results . pct_visible } %! ${ qa_results . warnings . join ( ', ' ) } ${ qa_results . error_text || '' } `
} else {
console . log ( ` [💫] Task completed successfully: ${ qa_results . pct_visible } % ${ qa_results . warnings . join ( ', ' ) || '' } ` )
console . log ( ` Summary: ${ ( qa_results . main_content_title || qa_results . description || 'No title/description detected' ) . substring ( 0 , 80 ) } ... ${ qa_results . main_content_author || '' } ${ qa_results . main_content_date || '' } ` )
return true
}
}
async function passiveArchiveTask ( { browser , page , url } ) {
// archive passively (e.g. a tab that was opened already by a human), without changing the active page
const is_unarchivable_url = URL_SCHEMES_IGNORED . includes ( url . split ( ':' ) [ 0 ] )
const is_already_archived = ALREADY_ARCHIVED . has ( url . slice ( 0 , 4096 ) )
if ( is_unarchivable_url || is_already_archived ) return null
ALREADY_ARCHIVED . add ( url . slice ( 0 , 4096 ) )
// these have to be as early as possible because we're racing with the page load (we might even be too late)
// jk nevermind, we now re-open a new bg tab for every tab that's created to re-capture the initial request
// await page.setRequestInterception(true);
// await page.setCacheEnabled(false);
const original_url = url . toString ( )
const start_time = ( new Date ( ) )
const browser_version = await browser . version ( )
console . log ( '------------------------------------------------------------------------------' )
console . log ( '[➕ ] Starting archive of new tab opened in driver browser...' , await browser . version ( ) )
const snapshot_dir = await setupSnapshotDir ( { original_url , start_time } )
const snapshot = await setupSnapshotDB ( { original_url , start_time , snapshot_dir } )
console . log ( '------------------------------------------------------------------------------' )
console . log ( ` [🪟] Starting page & viewport setup ( ${ browser_version } ${ DEFAULT_VIEWPORT . isMobile ? 'mobile' : 'desktop' } ${ DEFAULT_VIEWPORT . width } x ${ DEFAULT_VIEWPORT . height } px)... ` )
// create a new page in the background for archiving
const old_page = page
page = await browser . newPage ( )
await old_page . bringToFront ( )
const client = await page . target ( ) . createCDPSession ( )
const extensions = await getChromeExtensionsFromCache ( { browser } )
const page_state = {
// global static state
browser ,
client ,
browser_version ,
extensions ,
// per-page static metadata
original_url ,
snapshot ,
snapshot_dir ,
start_time : start_time.toISOString ( ) ,
start_ts : Number ( start_time ) ,
version : versionStrFromDate ( start_time ) ,
// per-page mutable archiving state
main_response : null ,
recorder : null ,
console_log : [ ] ,
traffic_log : { } ,
redirects : { } ,
}
page . _original_url = original_url
try {
// run all page setup functions in parallel
const results = await Promise . allSettled ( [
// loadAuthStorage(page, page_state, {apply: true}),
startMetadataRecording ( page , page_state ) ,
setupURLRewriting ( page , page_state ) ,
startResponseSaving ( page , page_state ) ,
saveYTDLP ( page , page_state ) ,
saveGALLERYDL ( page , page_state ) ,
// saveSourceMaps(page, page_state),
] ) ;
const rejected = results
. filter ( result => result . status === 'rejected' )
. map ( result => ( result as PromiseRejectedResult ) . reason )
if ( rejected . length ) console . warn ( '[⚠️] Partial failures during page setup:' , rejected )
} catch ( err ) {
console . warn ( '[❌] ERROR DURING PAGE SETUP' , JSON . stringify ( err , null , 4 ) )
return
}
// load the url in the background page, then switch to it once it's loaded and close the original tab
console . log ( '[➡️] NAVIGATION[INI]' , ANSI . blue + url + ANSI . reset )
const startrecording_promise = startScreenrecording ( page , page_state )
page_state . main_response = await page . goto ( url , { waitUntil : 'load' , timeout : 40_000 } )
// for debugging
globalThis . page = page
globalThis . page_state = page_state
// start loading the page, start screenrecording, close the old page, and wait for loading to finish (all at once, fine for these to race)
try {
const results = await Promise . allSettled ( [
startrecording_promise ,
page . bringToFront ( ) ,
old_page . close ( ) ,
page . waitForNetworkIdle ( { concurrency : 0 , idleTime : 900 , timeout : 20_000 } ) ,
] )
const rejected = results
. filter ( result => result . status === 'rejected' )
. map ( result => ( result as PromiseRejectedResult ) . reason )
if ( rejected . length ) console . warn ( '[⚠️] Partial failures during page load:' , rejected )
} catch ( err ) {
console . warn ( '[❌] ERROR DURING PAGE LOAD' , JSON . stringify ( err , null , 4 ) )
return
}
if ( page_state . main_response === null ) {
page_state . main_response = await page . waitForResponse ( ( ) => true )
}
assert ( page_state . main_response )
if ( page_state . main_response . status ( ) == 429 ) {
throw ` [⚠️] Got 429 rate-limit response, skipping this URL for now... `
}
// resume page if paused by waitForDebuggerOnStart/dev tools debugger/backgrounding
try {
await client . send ( 'Page.enable' ) ;
await client . send ( 'Page.setWebLifecycleState' , { state : 'active' } ) ;
await client . send ( 'Runtime.runIfWaitingForDebugger' )
} catch ( err ) { /* console.warn(err) */ }
// wait a few seconds for the page to finish loading
await wait ( 5_000 )
// emulate human browsing behavior
// await disableAnimations(page, page_state);
// await jiggleMouse(page, page_state);
await solveCaptchas ( page , page_state ) ;
// await blockRedirects(page, page_state);
// await scrollDown(page, page_state);
// await expandComments(page, page_state);
await submitForm ( page , page_state ) ;
// await blockJSExecution(page, page_state);
await stopMetadataRecording ( page , page_state ) // stop tampering with page requests & JS
console . log ( '[3/4]-------------------------------------------------------------------------' )
// do all synchronous archiving steps that need exclusive use of the whole page while they run
const saveScreenrecording_promise = saveScreenrecording ( page , page_state ) ;
await saveScreenshot ( page , page_state ) ;
await savePDF ( page , page_state ) ;
console . log ( '[4/4]-------------------------------------------------------------------------' )
// do all async archiving steps that can be run at the same time
await inlineShadowDOM ( page , page_state ) ;
const results = await Promise . allSettled ( [
saveTitle ( page , page_state ) ,
saveSEO ( page , page_state ) ,
saveFavicon ( page , page_state ) ,
saveSSL ( page , page_state ) ,
saveRequests ( page , page_state ) ,
saveRedirects ( page , page_state ) ,
saveHeaders ( page , page_state ) ,
saveRaw ( page , page_state ) ,
saveDOM ( page , page_state ) ,
saveBodyText ( page , page_state ) ,
// savePandoc(page, page_state),
saveReadability ( page , page_state ) ,
saveAccessibility ( page , page_state ) ,
saveOutlinks ( page , page_state ) ,
// saveAuthStorage(page, page_state),
saveAIQualityAssuranceResult ( page , page_state ) ,
] ) ;
// do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
const bg_results = Promise . allSettled ( [
saveScreenrecording_promise ,
saveSinglefile ( page , page_state ) ,
// saveArchiveWebPage(page, page_state),
// savePocket(page, page_state),
] )
const { duration } = await saveMetrics ( page , page_state ) ;
const rejected = results
. filter ( result => result . status === 'rejected' )
. map ( result => ( result as PromiseRejectedResult ) . reason )
if ( rejected . length )
console . warn ( '[⚠️] Partial failures during page archiving:' , rejected )
// Start an interactive REPL here with the `page` instance.
// https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
// await page.repl()
// await page.browser().repl()
console . log ( ` [✅] Finished archiving in ${ duration / 1000 } s. ` , )
// await page.tracing.stop();
try {
const rejected = ( await bg_results )
. filter ( result => result . status === 'rejected' )
. map ( result => ( result as PromiseRejectedResult ) . reason )
if ( rejected . length )
console . warn ( '[⚠️] Partial failures during page wrap-up tasks:' , rejected )
} catch ( err ) {
console . log ( err )
}
await symlinkBestSnapshotResults ( snapshot_dir )
}
/******************************************************************************/
/************************* Page Setup Tasks ***********************************/
async function setupSnapshotDir ( { original_url , start_time , snapshot_dir = null } ) {
// setup archive/<id> snapshot output folder, move old files into versions/<date>/* + clear any existing symlinks
const snap_dir = snapshot_dir || TASK_PATH ( original_url )
console . log ( )
console . log ( )
console . log ( ANSI . blue + original_url + ANSI . reset )
console . log ( ANSI . black + snap_dir + ANSI . reset )
console . log ( )
console . log ( '[📂] Setting up Snapshot output directory...' . padEnd ( 82 ) , prettyPath ( snap_dir ) )
// check for existing data at old legacy paths e.g. ./data/archive/1999999999.1723425
const hacky_dir = path . join ( ARCHIVE_DIR , ` 1999999999. ${ hashCode ( original_url ) } ` )
const known_dir = SNAPSHOT_DIRS_BY_URL [ original_url ]
const known_dir_exists = fs . existsSync ( known_dir )
const hacky_dir_exists = fs . existsSync ( hacky_dir )
if ( snap_dir == hacky_dir ) {
if ( known_dir_exists ) {
throw ` Tried to create snapshot in ${ snap_dir } but potential duplicate exists in ${ known_dir } ! `
}
} else if ( snap_dir == known_dir ) {
if ( hacky_dir_exists ) {
throw ` Tried to create snapshot in ${ snap_dir } but potential duplicate exists in ${ hacky_dir } ! `
}
} else {
if ( known_dir_exists ) {
throw ` Tried to create snapshot in ${ snap_dir } but potential duplicate exists in ${ known_dir } ! `
} else if ( hacky_dir_exists ) {
throw ` Tried to create snapshot in ${ snap_dir } but potential duplicate exists in ${ hacky_dir } ! `
} else {
throw ` Tried to create snapshot in ${ snap_dir } but it's not a recognized snapshot dir path:\n - ${ known_dir }\n - ${ hacky_dir } `
}
}
// mkdir -p ./data/archive/<snap_id>/versions && cd ./data/archive/<snap_id>
await fs . promises . mkdir ( path . join ( snap_dir , 'versions' ) , { recursive : true } )
process . chdir ( snap_dir )
// clear any /data/archive/<snap_id>/*.* symlinks pointing to existing ./versions/<versionid>/*.* files
await clearSnapshotDirSymlinks ( snap_dir )
// move /data/archive/<snap_id>/*.* loose output files from any prior run into ./versions/<versionid>/*.*
await collectSnapshotDirVersionFiles ( snap_dir )
// update /data/indexes/<index_name>/* to include references to /data/archive/<snap_id> as-needed
await updateSnapshotDirIndexes ( snap_dir , { original_url , start_time } )
// assert /data/archive/<snap_id>/ contains no invalid/partial files + is empty/ready to receive new files
await assertSnapshotDirIsValid ( snap_dir , { is_empty : true } )
return snap_dir
}
// ./index/<index_name> : index_getter(page_state) => "<index_key_str>"
const INDEXES = {
snapshots_by_day : ( { start_time } ) =>
versionStrFromDate ( start_time , { withDate : true , withTime : false } ) ,
snapshots_by_domain : ( { original_url } ) =>
( new URL ( original_url ) ) . hostname || '' , // hostname does not include :port
}
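// e.g. for a snapshot taken on 2001-01-31 of a page on example.com, updateSnapshotDirIndexes()
// below ends up creating symlinks like these (illustrative paths only):
//   ./data/index/snapshots_by_day/20010131/<snap_id>        -> ./data/archive/<snap_id>
//   ./data/index/snapshots_by_domain/example.com/<snap_id>  -> ./data/archive/<snap_id>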
async function updateSnapshotDirIndexes ( snap_dir , page_state , indexes = INDEXES , indexes_dir = INDEXES_DIR ) {
assert ( indexes )
console . log ( ` [🔎] Linking Snapshot in indexes ( ${ Object . keys ( indexes ) . join ( ', ' ) } )... ` )
// const {snapshot_dir, original_url, start_ts} = page_state
for ( const [ index_name , index_key_getter ] of Object . entries ( indexes ) ) {
const index_entry = await indexSnapshotDir ( snap_dir , { index_name , index_key_getter , indexes_dir } , page_state )
}
}
async function indexSnapshotDir ( snap_dir , { index_name , index_key_getter , indexes_dir = INDEXES_DIR } , page_state ) {
// place symlinks to this snapshot in any /indexes/<index_name/ indexes as-needed
// const snap_id = snap_dir.split('/').at(-1)
const index_dir = path . join ( indexes_dir , index_name ) // /data/index/snapshots_by_day
await fs . promises . mkdir ( index_dir , { recursive : true } )
// calculate the index key, e.g. "20010131" or "example.com"
assert ( index_name && index_key_getter )
assert ( page_state )
const index_key = String ( index_key_getter ( page_state ) ) // '20010131'
assert ( index_key )
const snap_id = path . parse ( snap_dir ) . base // '19999999.23423523'
assert ( snap_id )
const index_entries_dir = path . join ( index_dir , index_key ) // /data/index/snapshots_by_day/20010131
await fs . promises . mkdir ( index_entries_dir , { recursive : true } )
const symlink_path = path . join ( index_entries_dir , snap_id ) // /data/index/snapshots_by_day/20010131/19999999.23423523
// create symlink index/snapshots_by_day/<YYYYMMDD>/<snap id> -> ./archive/<snap_id> symlink
const { symlink_abspath } = await overwriteSymlink ( snap_dir , symlink_path , { relative : true , mkdirs : false } )
}
async function collectSnapshotDirVersionFiles ( snap_dir ) {
// move archive/<id>/*.* snapshot output files into archive/<id>/versions/<date>/* dated version folder
// detect start time / version info from previous result metrics.json
const snap_id = snap_dir . split ( '/archive/' ) . at ( - 1 )
const existing_metrics = path . join ( snap_dir , 'metrics.json' )
let { start_time , VERSION } = { start_time : '1970-01-01T00:00:00.000Z' , VERSION : '19700101000000' }
try {
; ( { start_time , VERSION } = JSON . parse ( await fs . promises . readFile ( existing_metrics , 'utf-8' ) ) ) ;
} catch ( err ) {
// continue normally, overwriting existing files is fine if they're broken to begin with
}
// create new version folder based on metrics.json start_time (or epoch time as fallback for legacy output)
const version_dir_name = VERSION || versionStrFromDate ( start_time )
const version_dir = path . join ( snap_dir , 'versions' , version_dir_name )
await fs . promises . mkdir ( version_dir , { recursive : true } )
// move all result files from snapshot_dir root into version folder
const existing_snapshot_files =
( await fs . promises . readdir ( snap_dir , { withFileTypes : true } ) )
. filter ( dirent => {
if ( dirent . name . startsWith ( '.' ) ) return false // ignore hidden files, dont version them
if ( dirent . name == 'versions' ) return false // dont try to move versions folder into itself
if ( dirent . isSymbolicLink ( ) ) return false // skip existing symbolic links
return ( dirent . isFile ( ) || dirent . isDirectory ( ) ) // dont try to version sockets/FIFOs/devs etc.
} )
if ( existing_snapshot_files . length ) {
console . log ( ` [📅] Moving snapshot results into version dir: ./data/archive/ ${ snap_id } /* -> ` . padEnd ( 82 ) , ` ./data/archive/ ${ snap_id } /versions/ ${ VERSION } / ` )
}
const snapshot_files = await getDirInfo ( snap_dir , { withRoot : false , filter : ( { relpath } ) => ! relpath . startsWith ( 'versions' ) } )
const version_files = await getDirInfo ( version_dir , { withRoot : false } )
for ( const { name } of existing_snapshot_files ) {
const snapdir_entry_abspath = path . join ( snap_dir , name )
const versioned_entry_abspath = path . join ( version_dir , name )
const snapshot_entry = snapshot_files [ name ]
const version_entry = version_files [ name ]
if ( snapshot_entry && version_entry ) {
// a conflicting file/dir already exists in the destination path
// we have a few options here, we can try to merge them, or we can create a new version
if ( snapshot_entry . sha256 == version_entry . sha256 ) {
// both are the same already, delete the duplicate (leaving the copy inside the version dir)
// if (snapshot_entry.is_dir) {
// await fs.promises.rmdir(snapshot_entry.abspath, {recursive: true})
// } else {
// await fs.promises.unlink(snapshot_entry.abspath)
// }
// console.warn(`[!] Found harmless exact duplicate files, leaving as is: ${snapshot_entry.summary} and ${version_entry.summary}`)
} else {
// both differ; no automatic resolution yet, so just warn and keep both copies
if ( snapshot_entry . num_bytes > version_entry . num_bytes ) {
// snapshot entry is bigger, keep it and delete version entry?
} else {
// version entry is bigger, keep it and delete snapshot entry
}
console . warn ( ' ' , snapshot_entry . summary )
console . warn ( ' ' , version_entry . summary )
// throw `Found conflicting duplicate files with different contents: ${name}`
}
} else {
// mv ./data/archive/<snap_id>/example.txt -> ./data/archive/<snap_id>/versions/<version_id>/example.txt
await fs . promises . rename ( snapdir_entry_abspath , versioned_entry_abspath )
console . log ( ` ↣ ${ prettyPath ( snapdir_entry_abspath ) } -> ` . padEnd ( 82 ) , prettyPath ( versioned_entry_abspath ) )
}
}
}
// Extractor definition
// {
// phase: setup | load | sync1 | async1 | sync2 | close
// name: 'media' | 'photos' | 'wget' | 'singlefile'
//
// shouldRun(page, page_state)
// pageSetup
// pageLoad
// pageInteraction clicking around/scrolling
// archivePhase1 sync
// archivePhase2 async
// archivePhase3 async
// pageClose
// execute(page, page_state)
// validateResult(page, page_state)
// }
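// A rough sketch of the Extractor shape described above as a TypeScript type
// (assumption: field names and phases come only from the comment above, nothing is wired up yet):
// type ExtractorPhase = 'setup' | 'load' | 'sync1' | 'async1' | 'sync2' | 'close'
// interface Extractor {
//     phase: ExtractorPhase
//     name: string                                             // e.g. 'media', 'photos', 'wget', 'singlefile'
//     shouldRun(page: Page, page_state: object): Promise<boolean>
//     execute(page: Page, page_state: object): Promise<unknown>
//     validateResult(page: Page, page_state: object): Promise<boolean>
// }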
async function clearSnapshotDirSymlinks ( snap_dir ) {
// delete all archive/<id>/* symlinks in preparation for new snapshot output to be placed there
const existing_symlinks =
( await fs . promises . readdir ( snap_dir , { withFileTypes : true } ) )
. filter ( dirent => {
if ( dirent . name . startsWith ( '.' ) ) return false // ignore hidden files, dont version them
if ( dirent . name == 'versions' ) return false // dont try to move versions folder into itself
return dirent . isSymbolicLink ( )
} )
for ( const { name : existing_symlink } of existing_symlinks ) {
await fs . promises . unlink ( path . join ( snap_dir , existing_symlink ) )
// if symlinks are not cleared before starting, it can cause issues with outputs writing into previous versions folders
// e.g. screenrecording saves to ./media which could be pointing to a previous version's ./versions/<olddate>/media
}
}
async function symlinkBestSnapshotResults ( snap_dir ) {
// move any existing files into versions/<date> folder (clear out main folder)
// symlink latest files from versions/<date>/* into main folder
await fs . promises . mkdir ( path . join ( snap_dir , 'versions' ) , { recursive : true } )
process . chdir ( snap_dir )
const metrics_file = path . join ( snap_dir , 'metrics.json' )
// if (!fs.existsSync(metrics_file) || (await fs.promises.lstat(metrics_file)).isSymbolicLink()) {
// console.warn('[⚠️] Warning, found partial dirty snapshot state (did the snapshot get interrupted?)', snap_dir)
// }
// move output files into versioned folder
await collectSnapshotDirVersionFiles ( snap_dir )
// clear any existing symlinks
await clearSnapshotDirSymlinks ( snap_dir )
// assert task dir is empty and contains no bare files that might get overwritten, also asserts version dirs are valid
await assertSnapshotDirIsValid ( snap_dir , { is_empty : true } )
const version_dirs = ( await fs . promises . readdir ( path . join ( snap_dir , 'versions' ) ) ) . sort ( ) // earliest to latest
const most_recent = version_dirs . at ( - 1 )
// for each version dir in versions/ (oldest -> newest)
for ( const version_dir of version_dirs ) {
if ( version_dir . startsWith ( '.' ) ) continue
const version_dir_abspath = path . join ( snap_dir , 'versions' , version_dir )
const version_dir_files = (
( await fs . promises . readdir ( version_dir_abspath ) )
. filter ( filename => ! filename . startsWith ( '.' ) ) )
// iterate through all the files/folders in the version dir
for ( const filename of version_dir_files ) {
const snapdir_entry = path . join ( snap_dir , filename ) // ./data/archive/<snapid>/filename
const versiondir_entry = path . join ( snap_dir , 'versions' , version_dir , filename ) // ./data/archive/<snapid>/versions/<versionid>/filename
if ( fs . existsSync ( snapdir_entry ) ) {
// if an entry already exists in the snapshot root for this filename
if ( ( await fs . promises . lstat ( snapdir_entry ) ) . isSymbolicLink ( ) ) {
// if a symlink already exists in the root with the same name,
// check if the version file we're looking at is a better candidate to replace it
const existing_abspath = await fs . promises . realpath ( snapdir_entry )
const desired_abspath = path . join ( version_dir_abspath , filename )
if ( existing_abspath != desired_abspath ) {
// check if the new candidate is larger or if the existing symlink is larger (largest file = most likely to be highest quality capture data)
const largest_path = await getLargestPath ( existing_abspath , desired_abspath )
if ( largest_path != ( await fs . promises . realpath ( existing_abspath ) ) ) {
const larger_version = path . basename ( path . dirname ( largest_path ) )
const larger_abspath = path . join ( snap_dir , 'versions' , larger_version , filename )
// console.log(' - swapping for larger file:', filename, '->', larger_abspath.split('/archive/').at(-1))
await overwriteSymlink ( larger_abspath , snapdir_entry , { search_limit : snap_dir } )
} else {
// console.log(' - leaving larger file:', largest_path.split('/archive/').at(-1))
}
} else {
// leave existing symlink pointing to current version file, nothing to change
// console.log(' - leaving current file:', existing_abspath.split('/archive/').at(-1))
}
} else {
// clearSnapshotDirSymlinks() should have already cleared these files out!
throw ` Non-symlink file found in root of snapshot dir! Refusing to overwrite: ${ prettyPath ( snapdir_entry ) } `
}
} else {
// no entry exists in the snapshot root for this filename, create one by linking to the version file
await overwriteSymlink ( versiondir_entry , snapdir_entry , { search_limit : snap_dir } )
}
// if (version_dir == most_recent) {
// // only log most recent links even though we link older ones too (otherwise its too noisy)
// console.log(` 🔗 ./${filename} -> ./${versiondir_entry} linking...`)
// }
}
}
return snap_dir
}
async function assertSnapshotDirIsValid ( snap_dir , { is_empty = false } = { } ) {
process . chdir ( snap_dir )
console . log ( )
console . log ( ` [☑️] Checking that snapshot records are valid... ` )
// get all directory entries in archive/<snapshot_id>/*
const snapshot_dir_entries =
( await fs . promises . readdir ( snap_dir , { withFileTypes : true } ) )
. filter ( dirent => {
if ( dirent . name . startsWith ( '.' ) ) return false
if ( dirent . name == 'versions' ) return false
return true // keep everything else so the checks below can validate it
} )
// assert versions folder exists and is not a symbolic link
const versions_dir = path . join ( snap_dir , 'versions' )
assert ( fs . existsSync ( versions_dir ) )
assert ( ! ( await fs . promises . lstat ( versions_dir ) ) . isSymbolicLink ( ) )
// if it should be empty, check that no loose files exist
if ( is_empty ) {
assert ( ! snapshot_dir_entries . length , ` Found loose files in snapshot-dir that shouldn't be there! ${ snap_dir } ` )
}
// assert all non-hidden files in snapshot dir are symbolic links to actual data in versions/<date>/*
for ( const snapshot_dir_entry of snapshot_dir_entries ) {
if ( snapshot_dir_entry . name . startsWith ( '.' ) ) continue
if ( snapshot_dir_entry . name == 'versions' ) continue
assert ( snapshot_dir_entry . isSymbolicLink ( ) , ` Found non-symbolic link in root of snapshot dir! ${ snap_dir } / ${ snapshot_dir_entry . name } ` )
assert ( fs . existsSync ( snapshot_dir_entry . name ) , ` Found broken symbolic link in root of snapshot dir! ${ snap_dir } / ${ snapshot_dir_entry . name } ` )
}
const version_entries = (
( await fs . promises . readdir ( versions_dir ) )
. filter ( foldername => ! foldername . startsWith ( '.' ) )
. sort ( ) )
console . log ( ` √ ${ prettyPath ( versions_dir ) } ` , version_entries . length )
for ( const version_dir of version_entries ) {
await assertVersionDirIsValid ( path . join ( versions_dir , version_dir ) )
}
// write snapshot dir file listing w/ sizes & hashes to .files.json
const directory_info = await getDirInfo ( snap_dir , { withRoot : true , withHelpers : false , maxdepth : 3 } )
await overwriteFile ( path . join ( snap_dir , '.files.json' ) , directory_info )
}
async function assertVersionDirIsValid ( version_dir ) {
const dirname = path . parse ( version_dir ) . name
assert ( fs . existsSync ( version_dir ) , ` Version dir does not exist: ${ prettyPath ( version_dir ) } ` )
const dirent = await fs . promises . lstat ( version_dir )
assert ( dirent . isDirectory ( ) && ! dirent . isSymbolicLink ( ) , ` Found non-directory in versions dir! ${ prettyPath ( version_dir ) } ` )
const unix_epoch = '19700101000000'
const is_name_valid_datestr = /^\d+$/ . test ( dirname ) && ( dirname . length == 14 ) && ( dirname . startsWith ( '2' ) || dirname == unix_epoch ) && parseVersionDateStr ( dirname )
assert ( is_name_valid_datestr , ` Version directories must be a 14-character long date string like 20251231235959! ${ dirname } ` )
// get all directory entries in archive/<snapshot_id>/versions/<version_id>/*
const version_dir_entries = (
( await fs . promises . readdir ( version_dir , { withFileTypes : true } ) )
. filter ( ( dirent ) => ! dirent . name . startsWith ( '.' ) ) )
// assert version dir contains only actual snapshot output files (not-symbolic links or other version dirs)
for ( const version_dir_entry of version_dir_entries ) {
assert ( version_dir_entry . name != 'versions' , ` Version dir cannot contain another versions folder! ${ prettyPath ( version_dir ) } /versions ` )
assert ( ! version_dir_entry . isSymbolicLink ( ) , ` Version dir cannot contain symbolic link! ${ prettyPath ( version_dir ) } / ${ version_dir_entry . name } ` )
}
// color highlight the unix epoch version in black, and any version created today in blue
let pretty_dirname = dirname
if ( dirname == unix_epoch ) {
pretty_dirname = ANSI . black + unix_epoch + ANSI . reset
}
const today = versionStrFromDate ( new Date ( ) , { withDate : true , withTime : false } )
if ( dirname . startsWith ( today ) ) {
pretty_dirname = ANSI . blue + dirname + ANSI . reset
}
// write version dir file listing w/ sizes & hashes to .files.json
const directory_info = await getDirInfo ( version_dir , { withRoot : true , withHelpers : false , maxdepth : 3 } )
await overwriteFile ( path . join ( version_dir , '.files.json' ) , directory_info )
console . log ( ` √ ./versions/ ${ pretty_dirname } contains ` , version_dir_entries . length , 'results' )
}
async function setupSnapshotDB ( { original_url , start_time , snapshot_dir } ) {
// setup Snapshot database row, finding it if it already exists or creating a new one
const timestamp = snapshot_dir . split ( '/' ) . at ( - 1 )
const search_attrs = { url : original_url , timestamp }
const update_attrs = { url : original_url , timestamp , added : start_time , title : null }
let snapshot = await Snapshot . findOne ( { where : search_attrs } ) ;
let created = false
if ( ! snapshot ) {
snapshot = await Snapshot . findOne ( { where : { url : original_url } } ) ;
if ( snapshot ) {
// console.warn(`[X] Found DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) that has different timestamp from existing dir ${prettyPath(snapshot_dir)}!`)
// throw 'Snapshot DB record does not match filesystem path!'
} else {
console . log ( ` [+] Creating new DB Snapshot [ ${ timestamp } ]( ${ original_url . substring ( 0 , 30 ) } ...) for ${ prettyPath ( snapshot_dir ) } ... ` )
// ;([snapshot, created] = await Snapshot.findOrCreate({where: search_attrs, defaults: update_attrs }));
// throw 'Wanted to create new Snapshot but refusing to modify DB during testing!'
}
}
// assert(snapshot && (snapshot instanceof Snapshot))
return snapshot
}
async function setupViewport ( page , _page_state ) {
// setup viewport
await page . setViewport ( DEFAULT_VIEWPORT ) ;
await page . setGeolocation ( DEFAULT_GEOLOCATION ) ;
// await page.setBypassCSP(true); // bypass CSP restrictions (requires --disable-web-security)
page . setDefaultTimeout ( DEFAULT_TIMEOUT ) ;
// Optional: emulate a mobile device
// await page.emulate(puppeteer.devices['iPhone 6']);
// Configure light mode/dark mode & accessibility reduced motion preferences
await page . emulateMediaFeatures ( [
{ name : 'prefers-color-scheme' , value : DEFAULT_COLOR_SCHEME } ,
{ name : 'prefers-reduced-motion' , value : 'reduce' } ,
] ) ;
// Set up headers & deterministically choose a pseudo-random referrer based on the URL hash
const rand_idx = hashCode ( await page . url ( ) ) % DEFAULT_REFERRERS . length
await page . setExtraHTTPHeaders ( {
. . . DEFAULT_HEADERS ,
referer : DEFAULT_REFERRERS [ rand_idx ] , // the HTTP header name is spelled 'referer'
} )
// Set up a detector to flag if the site tries to sniff whether we are a bot
function sniffDetector() {
const userAgent = window . navigator . userAgent ;
const platform = window . navigator . platform ;
// @ts-ignore
window . navigator . __defineGetter__ ( 'userAgent' , function ( ) {
// @ts-ignore
window . navigator . sniffed = true ;
return userAgent ;
} ) ;
// @ts-ignore
window . navigator . __defineGetter__ ( 'platform' , function ( ) {
// @ts-ignore
window . navigator . sniffed = true ;
return platform ;
} ) ;
}
await page . evaluateOnNewDocument ( sniffDetector ) ;
// @ts-ignore
const was_sniffed = await page . evaluate ( ( ) => ( ! ! window . navigator . sniffed ) )
if ( was_sniffed ) {
console . warn ( '[⚠️] Site tried to sniff if we are a bot! Site may be difficult to archive.' )
}
return page
}
async function setupModalAutoClosing ( page , page_state , { timeout = 1_250 } = { } ) {
page . on ( 'dialog' , ( dialog ) => {
console . log ( ` [👆] Auto-closing modal that popped up: ${ dialog . message ( ) } ... ` )
setTimeout ( ( ) => { try { dialog . accept ( ) } catch ( err ) { } } , timeout ) ;
} )
// if you expect a file-upload dialog, use this to catch it instead:
// const [fileChooser] = await Promise.all([
// page.waitForFileChooser(),
// ]);
// await fileChooser.accept(['/tmp/myfile.pdf']);
page . on ( 'close' , ( ) => {
try {
page . off ( 'dialog' )
} catch ( err ) { }
} )
}
async function startScreenrecording ( page , page_state , { duration_limit = 60 , codec = 'libx264' } = { } ) {
await fs . promises . mkdir ( path . dirname ( SCREENRECORDING_PATH ( page ) ) , { recursive : true } )
// console.log(`[🎬] Starting screen-recording stream...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page)))
// alternative: interact with low-level puppeteer screencast API directly
// using puppeteer.page.screencast: https://pptr.dev/api/puppeteer.page.screencast
// const recorder = await page.screencast({path: SCREENRECORDING_PATH(page)});
// alternative: use puppeteer-stream for .webm/.mp4 screen recordings with tab audio included
// works sometimes but has a few issues, e.g.: https://github.com/SamuelScheit/puppeteer-stream/issues/8
// alternative: puppeteer-screen-recorder (most compatible/stable but doesn't include tab audio output)
const recorder = new PuppeteerScreenRecorder ( page , {
followNewTab : false ,
recordDurationLimit : duration_limit ,
// fps: 25,
// ffmpeg_Path: '<path of ffmpeg_path>' || null,
// videoFrame: {
// width: 1024,
// height: 768,
// },
// videoCrf: 18,
videoCodec : codec ,
// videoPreset: 'ultrafast',
// videoBitrate: 1000,
// autopad: {
// color: 'black' | '#35A5FF',
// },
// aspectRatio: '4:3',
} ) ;
page_state . recorder = recorder
await recorder . start ( SCREENRECORDING_PATH ( page ) )
page . on ( 'close' , async ( ) => { await saveScreenrecording ( page , page_state ) } ) ;
return page_state
}
async function startResponseSaving ( page , page_state ) {
const dir = RESPONSES_PATH ( page )
await fs . promises . mkdir ( dir , { recursive : true } )
console . log ( ` [🌄] Starting raw response bytes recording... ` . padEnd ( 82 ) , prettyPath ( dir ) + '/' )
// Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
const types_to_save = [
// 'document',
'script' ,
'stylesheet' ,
'font' ,
'image' ,
'media' ,
'xhr' ,
'websocket' ,
]
// reset responses index file to empty
const responses_log_path = path . join ( dir , 'index.jsonl' )
await overwriteFile ( responses_log_path , '' )
// add handler to save all matching responses into the output directory
page . on ( 'response' , async ( response ) => {
try {
const timestamp = versionStrFromDate ( new Date ( ) , { withDate : true , withTime : true , withSeconds : true , withMilliseconds : true } )
if ( ! page_state . main_response && ( response . request ( ) . url ( ) == page_state . original_url ) ) {
// save first response as main page response (if we havent already caught it earlier)
page_state . main_response = response
}
const status = response . status ( )
if ( ( status >= 300 ) && ( status < 500 ) ) {
// console.log('Got bad response from', response.url(), 'to', response.headers()['location'])
return
}
const request = response . request ( )
const resourceType = request . resourceType ( )
const url_scheme = ( response . url ( ) || request . url ( ) ) . split ( ':' ) [ 0 ] . toLowerCase ( )
const method = ( url_scheme === 'data' ) ? 'DATA' : request . method ( )
// console.log(' ', resourceType, response.url())
if ( types_to_save . includes ( resourceType ) ) {
// create ./responses/xhr/www.facebook.com/static/images/icons/ subdir based on hostname + path
const resource_type_dir = path . join ( dir , resourceType )
const url = new URL ( response . url ( ) )
let subdir = resource_type_dir
const url_path = ( url . pathname || '' ) . slice ( 0 , 250 ) . endsWith ( '/' )
? ( url . pathname || '' ) . slice ( 0 , 250 )
: path . dirname ( ( url . pathname || '' ) . slice ( 0 , 250 ) )
// determine subdirectory based on url type (handles http:,https:,file:,data:,chrome-extension:,about:,etc.)
if ( ! URL_SCHEMES_IGNORED . includes ( url_scheme ) ) {
// is a normal http:// or https:// url, use the domain + path to construct subdirectory
subdir = path . join ( resource_type_dir , ( url . hostname || 'data' ) . slice ( 0 , 250 ) , url_path )
} else if ( url_scheme == 'data' ) {
// is a data:... url, store in ./data subdirectory
subdir = path . join ( resource_type_dir , 'data' )
} else {
// is a chrome-extension:// or other special url, use the extension id + path to construct subdirectory
const url_path = path . dirname ( ( url . pathname || '' ) . slice ( 0 , 999 ) )
subdir = path . join ( resource_type_dir , url_scheme , ( url . hostname || 'data' ) . slice ( 0 , 250 ) , url_path )
}
// write response to responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
let abspath = null
let resp_mimetype = null
let extension = ''
let uniq_filename = null
let uniq_abspath = null
let symlink_abspath = null
let responseSha256 = null
try {
await fs . promises . mkdir ( path . join ( dir , 'all' ) , { recursive : true } )
try {
await fs . promises . mkdir ( subdir , { recursive : true } )
} catch ( err ) {
subdir = subdir + '.dir' // TODO: apply this workaround to parent path entries too
try {
await fs . promises . mkdir ( subdir , { recursive : true } )
} catch ( err ) {
subdir = path . join ( resource_type_dir , 'data' )
await fs . promises . mkdir ( subdir , { recursive : true } )
}
}
; ( { abspath : symlink_abspath , resp_mimetype , extension } = await detectFilename ( { page , response , dir : subdir , resourceType } ) )
// responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
uniq_filename = `${ timestamp }__${ method }__` + [ encodeURIComponent ( url . href ) . slice ( 0 , 64 ) . replaceAll ( '/' , '_' ) . replace ( new RegExp ( `.${ extension }$` ) , '' ) , extension ] . filter ( s => s . length ) . join ( '.' )
uniq_abspath = path . join ( dir , 'all' , uniq_filename )
let bytesBuffer = null
try {
bytesBuffer = await response . buffer ( )
} catch ( err ) {
if ( String ( err ) . includes ( "Cannot read properties of undefined (reading 'body')" ) ) {
// not sure why it's happening, but sometimes it seems to be too late to capture the body (possible race condition)
} else {
console . warn ( '[⚠️] Failed to save response bytes for:' , response . request ( ) . url ( ) , err )
}
}
if ( bytesBuffer ) {
// write response data into ./all/<TS>__<METHOD>__<URL>.<EXT>
await overwriteFile ( uniq_abspath , bytesBuffer )
responseSha256 = crypto . createHash ( 'sha256' ) . update ( bytesBuffer ) . digest ( 'hex' )
// write symlink file to ./<TYPE>/<DOMAIN>/...<PATH>/<FILENAME>.<EXT> -> ./all/<TS>__<METHOD>__<URL>.<EXT>
await overwriteSymlink ( uniq_abspath , symlink_abspath , { relative : dir , mkdirs : true , search_limit : dir } )
}
// console.log(' ->', symlink_abspath)
} catch ( err ) {
// dont do anything for redirect responses, error responses, etc.
console . warn ( err )
}
const urlSha256 = crypto . createHash ( 'sha256' ) . update ( String ( request . url ( ) ) ) . digest ( 'hex' )
// const headersSha256 = crypto.createHash('sha256').update(String(request.headers())) // someday we may want to save headers hashes too
const truncated_url = ( method == 'DATA' ) ? request . url ( ) . slice ( 0 , 128 ) : request . url ( ) // don't duplicate bytes in data: urls (we already saved them in the file)
// this is essentially replicating the functionality of a WARC file, but in directory + index.jsonl form
await fs . promises . appendFile (
responses_log_path ,
JSON . stringify ( {
ts : timestamp ,
method ,
url : truncated_url ,
urlSha256 ,
postData : request.postData ( ) ,
response_url : ( ( method != 'DATA' ) && ( url . href != request . url ( ) ) ) ? url.href : undefined ,
status ,
resourceType ,
mimeType : resp_mimetype ,
responseSha256 ,
path : uniq_abspath?.replace ( dir , '.' ) ,
symlink_path : symlink_abspath?.replace ( dir , '.' ) ,
extension ,
} ) + '\n' ,
'utf-8' ,
)
}
} catch ( err ) {
// we should never throw hard errors here because there's nothing above us to catch it
// and we dont want to crash the entire CDP session / browser / main node process
console . warn ( '[❌] Error in response handler (set in startResponseSaving):' , err )
}
} ) ;
// handled by stopMetadataRecording():
// page.on('close', () => {
// page.off('response')
// })
}
function dedupeCookies ( cookies ) {
const len_before = cookies . length
const allowed_cookie_attrs = [ 'domain' , 'path' , 'name' , 'value' , 'expires' , 'sameSite' , 'sourceScheme' , 'url' , 'priority' , 'secure' , 'httpOnly' ]
const deduped_cookies = { }
for ( const cookie of cookies ) {
try {
const unique_id = ` ${ cookie . domain } ${ cookie . path } ${ cookie . name } `
deduped_cookies [ unique_id ] = {
. . . ( deduped_cookies [ unique_id ] || { } ) ,
. . . cookie ,
expires : 2147483640 , // max allowed expiry time (2038-01-18)
session : false , // make sure cookies dont expire at browser close time
secure : false , // make cookie restrictions more lax (for archiving scripts)
httpOnly : false , // make it easier to tamper with cookies from JS (for archiving scripts)
// "path": "/",
// "expires": 2147483641,
// "size": 194,
// "httpOnly": false,
// "secure": false,
// "session": false,
// "priority": "High",
// "sameParty": false,
// "sourceScheme": "Secure",
// "sourcePort": 443
// and more... https://pptr.dev/api/puppeteer.cookieparam
} as Cookie
if ( ! deduped_cookies [ unique_id ] . value ) {
delete deduped_cookies [ unique_id ]
continue
}
if ( deduped_cookies [ unique_id ] . name . startsWith ( '__' ) ) {
// cookies that start with __ must be secure, see https://github.com/puppeteer/puppeteer/issues/6806
deduped_cookies [ unique_id ] . secure = true
deduped_cookies [ unique_id ] . sourceScheme = 'Secure'
}
if ( deduped_cookies [ unique_id ] . domain . startsWith ( '.' ) ) {
deduped_cookies [ unique_id ] . sameParty = false
deduped_cookies [ unique_id ] . domain = deduped_cookies [ unique_id ] . domain . slice ( 1 )
}
for ( const key of Object . keys ( deduped_cookies [ unique_id ] ) ) {
if ( ! allowed_cookie_attrs . includes ( key ) ) {
delete deduped_cookies [ unique_id ] [ key ]
}
}
} catch ( err ) {
console . error ( '[❌] Failed to parse cookie during deduping' , cookie )
throw err
}
}
// console.log(`[🍪] Deduped ${len_before} cookies to ${Object.keys(deduped_cookies).length}...`)
return Object . values ( deduped_cookies ) as Cookie [ ]
}
async function loadCookiesTxt() {
const cookies = [ ] as Cookie [ ]
return cookies // write-only from chrome -> files for now (the parsing code below is intentionally unreachable)
if ( fs . existsSync ( COOKIES_TXT_PATH ) ) {
// console.log(`[🍪] Loading cookies/localStorage/sessionStorage from ${COOKIES_TXT_PATH}...`)
// Read from to cookies.txt file using tough-cookie + @root/file-cookie-store
const cookies_store = new FileCookieStore ( COOKIES_TXT_PATH , { auto_sync : false , lockfile : false } ) ;
cookies_store . getAllCookiesAsync = util . promisify ( cookies_store . getAllCookies ) ;
const exported_cookies = await cookies_store . getAllCookiesAsync ( )
for ( const cookie of exported_cookies ) {
const cookie_from_tough = cookie . toJSON ( )
const domain = cookie_from_tough . hostOnly ? ` . ${ cookie_from_tough . domain } ` : cookie_from_tough . domain
const cookie_for_puppeteer : Cookie = {
domain ,
name : cookie_from_tough.key ,
path : cookie_from_tough.path ,
value : cookie_from_tough.value ,
secure : cookie_from_tough.secure || false ,
httpOnly : cookie_from_tough.httpOnly || false ,
session : false ,
expires : ( new Date ( cookie_from_tough . expires ) ) . valueOf ( ) / 1000 ,
size : undefined ,
}
// console.log('COOKIE_FROM_TOUGH_TXT', cookie_from_tough, cookie_for_puppeteer)
cookies . push ( cookie_for_puppeteer )
}
}
}
type AuthJSON = {
cookies : Cookie [ ] ,
sessionStorage : any ,
localStorage : any ,
}
async function loadAuthStorage ( page , { client } , { apply = true } = { } ) {
var {
cookies ,
sessionStorage ,
localStorage ,
} : AuthJSON = { cookies : [ ] , sessionStorage : { } , localStorage : { } }
if ( ! LOAD_AUTH_STORAGE ) {
// dont read auth from filesystem auth.json/cookies.txt, just rely on existing cookies in chrome profile
return { cookies , sessionStorage , localStorage }
}
if ( fs . existsSync ( COOKIES_TXT_PATH ) ) {
try {
cookies = await loadCookiesTxt ( )
} catch ( err ) {
console . warn ( '[⚠️] Loaded invalid cookies.txt, moved it to cookies.txt.corrupted (did two processes try to change it at the same time?)' )
await fs . promises . rename ( COOKIES_TXT_PATH , COOKIES_TXT_PATH + '.corrupted' )
}
// console.log(`[🍪] Loading cookies from cookies.txt...`, cookies.length)
}
if ( fs . existsSync ( AUTH_JSON_PATH ) ) {
try {
var {
cookies : auth_json_cookies ,
sessionStorage ,
localStorage ,
} = JSON . parse ( await fs . promises . readFile ( AUTH_JSON_PATH , 'utf-8' ) ) ;
cookies = [ . . . cookies , . . . auth_json_cookies ]
// console.log(`[🍪] Loading cookies from auth.json...`, auth_json_cookies.length)
} catch ( err ) {
console . warn ( '[⚠️] Loaded invalid auth.json, moved it to auth.json.corrupted (did two processes try to change it at the same time?)' )
await fs . promises . rename ( AUTH_JSON_PATH , AUTH_JSON_PATH + '.corrupted' )
}
}
cookies = dedupeCookies ( cookies )
if ( apply ) {
console . log ( ` [🍪] Loading stored cookies/localStorage/sessionStorage into session... ` , cookies . length )
// if (cookies?.length) {
// try {
// // try setting all at once first (much faster)
// await page.setCookie(...cookies)
// } catch(err) {
// // if any errors, fall back to setting one-by-one so that individual error can be caught
// for (const cookie of cookies) {
// try {
// await page.setCookie(cookie);
// } catch(err) {
// console.error('[❌] Failed to set cookie', cookie)
// throw err
// }
// }
// }
// }
const origin = await page . evaluate ( ( ) => window . location . origin )
await page . evaluate ( ( savedSessionStorage ) => {
for ( const [ key , value ] of Object . entries ( savedSessionStorage ) ) {
sessionStorage [ key ] = value ;
}
} , sessionStorage [ origin ] || { } ) ;
await page . evaluate ( ( savedLocalStorage ) => {
for ( const [ key , value ] of Object . entries ( savedLocalStorage ) ) {
localStorage [ key ] = value ;
}
} , localStorage [ origin ] || { } ) ;
// origin/auth context changes when we do page.goto so we have to hook pageload and apply it then as well
// https://stackoverflow.com/questions/51789038/set-localstorage-items-before-page-loads-in-puppeteer
await page . evaluateOnNewDocument ( ( { sessionStorage , localStorage } ) => {
const origin = window . location . origin ;
for ( const [ key , value ] of Object . entries ( sessionStorage [ origin ] || { } ) ) {
window . sessionStorage . setItem ( key , value as string )
}
for ( const [ key , value ] of Object . entries ( localStorage [ origin ] || { } ) ) {
window . localStorage . setItem ( key , value as string )
}
} , { sessionStorage , localStorage } ) ;
}
return { cookies , sessionStorage , localStorage }
}
async function loadCloudflareCookie ( page , { original_url } , { timeout = 20_000 } = { } ) {
// make request to FlareSolverr server to get magic cookies that let us bypass cloudflare bot detection
// docker run -p 8191:8191 -e LOG_LEVEL=info ghcr.io/flaresolverr/flaresolverr
// alternatives if this stops working:
// - https://github.com/omkarcloud/botasaurus
// - https://github.com/ultrafunkamsterdam/nodriver
// - https://github.com/Akmal-CloudFreed/CloudFreed-CloudFlare-bypass
// - https://github.com/VeNoMouS/cloudscraper
const query = { url : original_url , cmd : "request.get" , maxTimeout : timeout }
try {
const response = await fetch ( FLARESOLVERR_API_ENDPOINT , {
method : 'POST' ,
headers : { 'Content-Type' : 'application/json' } ,
body : JSON.stringify ( query ) ,
} ) ;
const data = await response . json ( ) ;
const new_cookies = ( data?.solution?.cookies || [ ] ) . map ( cookie => ( {
. . . cookie ,
'expires' : 2147483640 , // overwrite expiration to 32bit maximum timestamp (2038-01-18)
'secure' : false , // cookie value is plain text (not encrypted/encoded)
} ) )
if ( new_cookies . length ) {
console . log ( ` [☑️] Got Cloudflare bypass cookies ( ${ new_cookies . length } ) from FlareSolverr API... ` )
await page . setCookie ( . . . new_cookies ) ;
return new_cookies
} else {
const error_str = JSON . stringify ( data?.message || data , null , 4 )
throw ` Bad FlareSolverr Response: ${ error_str } `
}
} catch ( error ) {
if ( JSON . stringify ( error ) . includes ( 'Challenge not detected' ) ) {
console . log ( '[☑️] Page is accessible without FlareSolverr Cloudflare bypass.' )
} else {
console . warn ( '[❌] Failed to get Cloudflare bypass cookies from FlareSolverr API.' , error )
}
}
return [ ]
}
async function setupURLRewriting ( page , page_state ) {
await page . setRequestInterception ( true ) ;
const rewrites = URL_REWRITES . sort ( ( a , b ) = > ( a . idx || 0 ) - ( b . idx || 0 ) )
page . on ( 'request' , interceptedRequest => {
if ( interceptedRequest . isInterceptResolutionHandled ( ) ) return ;
const original_url = interceptedRequest . url ( )
// apply all the rewrites in order to the request URL
let url = original_url
for ( const rewrite of rewrites ) {
const new_url = url . replace ( rewrite . pattern , rewrite . replacement )
// console.log(rewrite, url, new_url)
// if url is rewritten to an empty string, abort the request
if ( ! new_url ) {
console . warn ( '[🟥] Request blocked' , rewrite . pattern , ':' , url )
interceptedRequest . abort ( )
return
}
else if ( new_url && new_url != url ) {
// console.warn('[📳] Request rewritten', rewrite.pattern, rewrite.replacement, ':', url, '->', new_url)
console . warn ( '[📳] Request rewritten' , rewrite . pattern , ':' , new_url )
url = new_url
}
}
if ( url == original_url ) {
// if url is unchanged, continue request flow as-is
interceptedRequest . continue ( )
} else {
// otherwise redirect the browser to our rewritten version
interceptedRequest . respond ( {
status : 302 ,
headers : {
location : url ,
'x-redirect-by' : 'ArchiveBox.setupURLRewriting' ,
} ,
} )
}
} ) ;
// handled by stopMetadataRecording():
// page.on('close', () => {
// page.off('request')
// page.setRequestInterception(false)
// })
}
async function startMetadataRecording ( page , { original_url , version , client , traffic_log , console_log , redirects } ) {
// update helper state on page
page . _original_url = ( original_url || ( await page . url ( ) ) ) . toString ( )
// DEBUGGING: helpers for repl() debugging, dont rely on these (global state is badd mmkay)
// page._client = client || page._client || await page.target().createCDPSession()
// page._redirects = redirects
// page._traffic_log = traffic_log
// add initial entry to page redirect log
redirects [ original_url ] = {
idx : 0 ,
url : original_url ,
src : null ,
type : 'Initial' ,
wallTime : Date.now ( ) / 1000 ,
frameId : page.mainFrame ( ) . _id ,
requestId : null ,
initiator : { type : "user" } ,
isMainFrame : true ,
}
// DEBUGGING: record optional chrome debug trace with screenshots (heavy)
// try {
// await page.tracing.stop()
// await wait(200)
// } catch(err) {}
// try {
// await page.tracing.start({path: TRACE_PATH(page), screenshots: true});
// } catch(err) {}
let last_main_frame_url = original_url
// setup network request intercepts handler
const addCDPRequestDataListener = ( eventName ) => {
client . on ( eventName , event => {
try {
// save any HTTP/JS redirects to redirects for saveRedirects(page) to use later on
const new_url = event . documentURL
const http_status = event . redirectResponse?.status || 0
const is_new_url = ( new_url !== original_url ) && ! redirects [ new_url ]
const is_main_frame_navigation = ( event . frameId == page . mainFrame ( ) . _id )
const is_http_redirect = ( 300 < http_status ) && ( http_status < 400 )
if ( new_url && is_new_url && ( is_main_frame_navigation || is_http_redirect ) && event . type == 'Document' ) {
const new_redirect_entry = {
url : new_url ,
src : event.redirectResponse?.url || last_main_frame_url ,
type : http_status || 'JS' ,
wallTime : Date.now ( ) / 1000 ,
frameId : event.frameId ,
requestId : event.requestId ,
initiator : event.initiator ,
idx : Object.keys ( redirects ) . length ,
isMainFrame : is_main_frame_navigation ,
}
redirects [ new_url ] = new_redirect_entry
if ( is_main_frame_navigation ) {
ALREADY_ARCHIVED . add ( new_redirect_entry . url . slice ( 0 , 4096 ) ) // we're already archiving this tab as it redirects, dont create a duplicate archive for the destination
console . warn ( ` [➡️] NAVIGATION[ ${ new_redirect_entry . type } ] ${ ANSI . blue } ${ last_main_frame_url } ${ ANSI . reset }\n -> ${ ANSI . blue } ${ new_redirect_entry . url } ${ ANSI . reset } ` )
last_main_frame_url = new_url
}
}
if ( event . loaderId ) {
traffic_log [ event . loaderId ] = traffic_log [ event . loaderId ] || { } // make sure loader is also in requests list first
// sometimes it's not in the list if we start archiving too late / after a page's initial request was already made
}
// save to traffic_log as {8BC2087A2CCEF28017099C0E10E87440: {Network.eventWillBeSent: {eventId,loaderId, request|response, ...}}
// https://stackoverflow.com/questions/47078655/missing-request-headers-in-puppeteer?noredirect=1&lq=1
traffic_log [ event . requestId ] = traffic_log [ event . requestId ] || { }
Object . assign ( traffic_log [ event . requestId ] , { [ eventName ] : event } )
// DEBUGGING: log page visits and navigation events to console
// if (event?.response?.status) {
// // if we're expecting an HTML response, then we assume it's a page visit & log it to console
// const acceptMimeType = traffic_log[event.requestId]['Network.requestWillBeSentExtraInfo']?.headers?.accept
// if (acceptMimeType && acceptMimeType.includes('text/html')) {
// // log any HTML page responses (less noisy)
// console.log(`[>] GOT ${event.documentURL}: ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
// } else {
// // log ALL responses, including JS,CSS,Images,etc. (very noisy)
// // console.log(` > ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
// }
// }
} catch ( err ) {
console . warn ( '[X] Error during request/response handler (startMetadataRecording.addCDPRequestDataListener)' )
console . warn ( err )
}
} )
}
addCDPRequestDataListener ( 'Network.requestWillBeSent' )
addCDPRequestDataListener ( 'Network.requestWillBeSentExtraInfo' )
addCDPRequestDataListener ( 'Network.responseReceived' )
addCDPRequestDataListener ( 'Network.responseReceivedExtraInfo' )
// clear any existing log entries
const consolelog_info = {
TYPE : 'console' ,
VERSION : version ,
URL : original_url ,
}
await overwriteFile ( CONSOLELOG_PATH ( page ) , JSON . stringify ( consolelog_info ) + '\n' )
// record console logs from page
const appendConsoleLog = async ( line ) = > {
if ( ! line ) return
console_log . push ( line )
await fs . promises . appendFile (
CONSOLELOG_PATH ( page ) ,
line + '\n' ,
'utf-8' ,
)
}
page . on ( 'console' , async ( message ) = >
await appendConsoleLog ( ` ${ message . type ( ) . toUpperCase ( ) } ${ message . location ( ) } ${ JSON . stringify ( message . text ( ) ) } ` ) )
page . on ( 'pageerror' , async ( error ) = >
await appendConsoleLog ( error . message || JSON . stringify ( error ) ) )
page . on ( 'requestfailed' , async ( request ) = >
await appendConsoleLog ( ` ${ request . failure ( ) ? . errorText } ${ request . url ( ) || JSON . stringify ( request ) } ` ) )
// set puppeteer options on page
await client . send ( 'Network.enable' ) // enable network tampering API
await client . send ( 'Emulation.clearDeviceMetricsOverride' ) ; // clear timing statistics
await client . send ( 'Page.setDownloadBehavior' , {
behavior : 'allow' ,
downloadPath : CHROME_DOWNLOADS_DIR ,
} )
// handled by stopMetadataRecording():
// page.on('close', () => {
// try {
// page.off('request')
// page.off('console')
// page.off('pageerror')
// page.off('requestfailed')
// page.setRequestInterception(false)
// } catch(err) {
// // some versions of puppeteer have had race conditions here where page is already closed by now
// console.warn('[X] Error in page close handler', err)
// }
// })
return { original_url , client , redirects , traffic_log , console_log }
}
async function stopMetadataRecording ( page , _page_state ) {
console . log ( '[🪝] Stopping CDP event hooks and request interception...' )
try {
page . off ( 'request' )
page . off ( 'response' )
page . off ( 'console' )
page . off ( 'pageerror' )
page . off ( 'requestfailed' )
page . off ( 'hashchange' )
page . setRequestInterception ( false )
// page.tracing.stop()
} catch ( err ) {
// some versions of puppeteer have had race conditions here where page is already closed by now
console . warn ( '[X] Error in page close handler' , err )
}
}
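// Example usage (illustrative sketch only, not called anywhere in this file): pair startMetadataRecording()
// with stopMetadataRecording() around a page visit, then hand the collected state to the save* tasks below.
// The URL, version string, and CDP session creation here are placeholders, not values from this project.
// const client = await page.target().createCDPSession()
// const page_state = await startMetadataRecording(page, {
//     original_url: 'https://example.com',
//     version: '0.0.1',
//     client,
//     traffic_log: {},
//     console_log: [],
//     redirects: {},
// })
// await page.goto('https://example.com', {waitUntil: 'networkidle2'})
// await stopMetadataRecording(page, page_state)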
/********************** Human Behavior Emulation ******************************/
async function solveCaptchas(page, page_state, {timeout = 90_000} = {}) {
// using puppeteer-extra-plugin-recaptcha auto-solver
// await page.solveRecaptchas()
// using 2captcha-solver extension auto-solver
try {
// console.log('[🕑] Waiting for CAPTCHA to appear...')
await page . waitForSelector ( '.captcha-solver' , { timeout : 5_000 } )
console . log ( '[🤖] CAPTCHA challenge found, submitting to 2Captcha for solving...' )
await page . click ( '.captcha-solver' )
console . log ( ` [🧠] Waiting up to ${ timeout / 1000 } s for CAPTCHA to be solved... ` )
await page . waitForSelector ( ` .captcha-solver[data-state="solved"] ` , { timeout } )
console . log ( '[🔓] CAPTCHA solution retrieved from 2captcha.' )
} catch ( err ) {
console.log('[☑️] No CAPTCHA challenges found, site thinks we are human.')
}
}
async function jiggleMouse ( page , page_state , { timeout = 600 } = { } ) {
console . log ( ` [🐁] Moving mouse around randomly for ${ timeout / 1000 } s... ` )
const randomPoint = await getRandomPagePoint ( page )
const cursor = createCursor ( page , randomPoint , true )
cursor . toggleRandomMove ( true )
await wait ( timeout / 2 ) ;
await cursor . moveTo ( { x : DEFAULT_VIEWPORT.width / 2 , y : DEFAULT_VIEWPORT.height / 2 } ) ;
await wait ( timeout / 2 ) ;
cursor . toggleRandomMove ( false )
}
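// Related sketch (not wired up anywhere in this file): ghost-cursor can also perform human-like clicks on
// specific elements, e.g. a cookie-consent button. The selector below is hypothetical, only to show the API shape.
// async function humanClick(page, selector = 'button#accept-cookies') {
//     const cursor = createCursor(page, await getRandomPagePoint(page), true)
//     await cursor.click(selector)   // moves along a human-like path, then clicks the element
// }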
async function blockRedirects ( page , { original_url } ) {
page . on ( 'request' , req = > {
if ( req . isInterceptResolutionHandled ( ) ) return ;
// if it's a top-level navigation event to a new url
if ( req . isNavigationRequest ( ) && req . frame ( ) === page . mainFrame ( ) && req . url ( ) !== original_url ) {
req . abort ( 'aborted' ) ;
console.warn('[🟥] Blocked page attempt to navigate to new URL', req.url())
} else {
req . continue ( ) ;
}
} ) ;
// handled by stopMetadataRecording():
// page.on('close', () => {
// page.off('request')
// page.setRequestInterception(false)
// })
await page . setRequestInterception ( true ) ;
}
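// A similar interception pattern (sketch, not used above): drop heavy or unwanted resource types while archiving.
// It reuses the same isInterceptResolutionHandled()/continue()/abort() dance as blockRedirects(); the list of
// blocked types is an example, not a project setting.
// async function blockResourceTypes(page, blocked_types = ['media', 'font']) {
//     page.on('request', req => {
//         if (req.isInterceptResolutionHandled()) return;
//         if (blocked_types.includes(req.resourceType())) {
//             req.abort('blockedbyclient');
//         } else {
//             req.continue();
//         }
//     });
//     await page.setRequestInterception(true);
// }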
async function blockJSExecution ( page , _page_state ) {
console . warn ( '[🟥] Stopping all JS execution on page...' )
await page . evaluate ( ( ) = > {
debugger ;
} )
// OR alternatively this (more buggy, breaks many sites):
// const html = await page.content();
// page.setJavaScriptEnabled(false);
// await page.setContent(html, { waitUntil: 'networkidle0' }); // 4
}
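// Another possible approach (untested sketch): pause JS via CDP instead of parking the page on a debugger
// statement. Emulation.setScriptExecutionDisabled stops further script execution without touching the DOM.
// async function blockJSExecutionViaCDP(page, client) {
//     client = client || await page.target().createCDPSession()
//     await client.send('Emulation.setScriptExecutionDisabled', {value: true})
// }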
async function scrollDown(page, _page_state, {timeout = 120_000, scroll_delay = SCROLL_DELAY, scroll_distance = SCROLL_DISTANCE, scroll_limit = SCROLL_LIMIT} = {}) {
const starting_height = await page . evaluate ( 'document.body.scrollHeight' ) ;
let last_height = starting_height
let scroll_count = 0 ;
let scroll_position = scroll_count * scroll_distance
// await page.bringToFront()
// scroll to top
await page . evaluate ( ( ) = > { window . scrollTo ( { top : 0 , left : 0 , behavior : 'smooth' } ) ; } ) ;
while ( ( scroll_count < scroll_limit ) && ( ( scroll_delay * scroll_count ) < timeout ) ) {
console . log ( ` [⬇️] Scrolling down ${ scroll_count } x 1000px... ( ${ scroll_position } / ${ last_height } ) ` )
await page . evaluate ( ( y_offset ) = > { window . scrollTo ( { top : y_offset , left : 0 , behavior : 'smooth' } ) ; } , scroll_position ) ;
scroll_count ++
scroll_position = scroll_count * scroll_distance
// check if any new content was added / if we are infiniscrolling
let new_height = await page . evaluate ( 'document.body.scrollHeight' )
const added_px = new_height - last_height
if ( added_px > 0 ) {
console . log ( '[✚] Detected infini-scrolling...' , ` ${ last_height } + ${ added_px } => ${ new_height } ` )
} else if ( scroll_position >= new_height + scroll_distance ) {
// we've reached the bottom, condition isn't true until we've tried to go n+1 past the end (which is fine)
if ( scroll_count > 2 )
break
}
last_height = new_height
// sleep scroll_delay ms between scroll steps so lazy-loaded content has time to appear before the next check
await wait ( scroll_delay ) ;
// facebook watch pages infiniscroll (more and more recommendations forever), stop them after 3 pages
if ( page . _original_url . startsWith ( 'https://www.facebook.com/watch/?v' ) && scroll_count > 3 ) break
}
// scroll to bottom
if ( scroll_position < last_height ) {
await page . evaluate ( ( ) = > { window . scrollTo ( { top : document.body.scrollHeight , left : 0 , behavior : 'smooth' } ) ; } ) ;
await wait ( scroll_delay )
await page . evaluate ( ( ) = > { window . scrollTo ( { top : document.body.scrollHeight , left : 0 , behavior : 'smooth' } ) ; } ) ;
}
// Always wait an additional 2sec at the end for scroll animations / loading / rendering to settle down
console . log ( '[📉] Reached bottom of the page.' , ` ( ${ scroll_position } / ${ last_height } ) ` )
await wait ( scroll_delay ) ;
await page . evaluate ( ( ) = > { window . scrollTo ( { top : 0 , left : 0 , behavior : 'smooth' } ) ; } ) ;
await wait ( scroll_delay ) ;
return last_height
}
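// Example call (sketch): scroll a long article with slower, shorter steps than the global defaults.
// The numbers below are arbitrary, shown only to illustrate the options object.
// const final_height = await scrollDown(page, page_state, {
//     timeout: 60_000,        // give up after 1min of scrolling
//     scroll_delay: 3_000,    // wait 3s between steps for lazy-loaded content
//     scroll_distance: 800,   // scroll 800px per step
//     scroll_limit: 50,       // at most 50 steps
// })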
async function disableAnimations ( page , _page_state ) {
console . log ( ` [⛄️] Disabling all animations using CSS override... ` )
// https://stackoverflow.com/questions/53167644/injecting-css-into-site-with-puppeteer
const css_override = `*, *::before, *::after {
    -moz-animation: none !important;
    -moz-transition: none !important;
    animation: none !important;
    transition: none !important;
    caret-color: transparent !important;
}`
// inject override into current page
await page . addStyleTag ( { content : css_override } ) ;
// inject override into any subsequently navigated pages
await page . evaluateOnNewDocument ( ( css_override ) = > {
const style_tag = document . createElement ( 'style' )
style_tag . type = 'text/css'
style_tag . innerHTML = css_override
document . getElementsByTagName ( 'head' ) [ 0 ] . appendChild ( style_tag )
} , css_override ) ;
}
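// Complementary trick (sketch, not used above): many sites also respect the prefers-reduced-motion media
// feature, which Puppeteer can emulate directly without injecting any CSS.
// await page.emulateMediaFeatures([
//     {name: 'prefers-reduced-motion', value: 'reduce'},
// ])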
async function expandComments(page, _page_state, {timeout = 120_000, limit = 15_000, delay = 650} = {}) {
console . log ( ` [🗃️] Expanding up to ${ limit } comments every ${ delay } ms... ` )
// expand all <details> sections in Github READMEs, HedgeDoc pages, etc.
await page.$$eval('pierce/article details', elems => { for (const elem of elems) elem.open = true }) // expand Github README details sections
await page.$$eval('pierce/div.js-discussion details:not(.details-overlay)', elems => { for (const elem of elems) elem.open = true }) // expand Github issue discussion hidden comments
await page.$$eval('pierce/.markdown-body details', elems => { for (const elem of elems) elem.open = true }) // expand HedgeDoc Markdown details sections
await page . exposeFunction ( 'onHashChange' , url = > page . emit ( 'hashchange' , url ) ) ;
await page . evaluateOnNewDocument ( ( ) = > {
// @ts-ignore
addEventListener ( 'hashchange' , ( e ) = > onHashChange ( location . href ) ) ;
} ) ;
// Listen for hashchange events in node Puppeteer code.
page . on ( 'hashchange' , url = > console . log ( 'Page tried to navigate to:' , new URL ( url ) ) ) ;
const num_expanded = await page . evaluate ( async ( { timeout , limit , delay } ) = > {
function getElementsByXPath ( xpath , ctx ? ) {
var results = [ ] ;
var xpathResult = document . evaluate (
xpath , // e.g. //*[text()='"+text+"']
ctx || document ,
null ,
XPathResult . ORDERED_NODE_ITERATOR_TYPE ,
null
) ;
var node ;
while ( ( node = xpathResult . iterateNext ( ) ) != null ) {
results . push ( node ) ;
}
return results ;
}
let num_expanded = 0
const getLoadMoreLinks = ( ) = > [
// find all the buttons/links to expand collapsed/hidden/lazy-loaded content
. . . document . querySelectorAll ( 'faceplate-partial[loading=action]' ) , // new reddit
. . . document . querySelectorAll ( 'a[onclick^="return morechildren"]' ) , // old reddit show more replies
. . . document . querySelectorAll ( 'a[onclick^="return togglecomment"]' ) , // old reddit show hidden replies
// ...document.querySelectorAll('a.js-show-link'), // stack overflow comments show more (TODO: make this only work on SO)
// ...document.querySelectorAll('a.morelink'), // HackerNews profile show more (TODO: make this only work on HN)
// ...getElementsByXPath("//*[text()~='View \d+ replies']"), // facebook comment expander
. . . getElementsByXPath ( "//*[text()='Show more replies']" ) , // twitter infiniscroll expander
. . . getElementsByXPath ( "//*[text()='Show replies']" ) , // twitter replies expander
]
const wait = ( ms ) = > new Promise ( res = > setTimeout ( res , ms ) )
let load_more_links = getLoadMoreLinks ( )
while ( load_more_links . length ) {
console . log ( 'Expanding comments...' , load_more_links . length )
for ( const link of load_more_links ) {
link . scrollIntoView ( { behavior : 'smooth' } )
if ( link . slot == 'children' ) {
continue
// patch new reddit "More replies" links that would open in a new window to display inline instead
// const comment_id = link.src.split('?')[0].split('/').at(-1)
// link.slot = `children-${comment_id}-0`
// link.__alwaysShowSlot = false
}
// click the "More replies" button
link . click ( )
num_expanded ++
await wait ( delay )
const time_elapsed = num_expanded * delay
if ( ( num_expanded > limit ) || ( time_elapsed > timeout ) )
return num_expanded
}
load_more_links = getLoadMoreLinks ( )
}
return num_expanded
} , { timeout , limit , delay } ) ;
page . off ( 'hashchange' )
if ( num_expanded ) {
console . log ( ` [🗃️] Expanded ${ num_expanded } comments... ` )
// scroll to bottom, then back up to top
const final_height = await page . evaluate ( 'document.body.scrollHeight' ) ;
await page . evaluate ( ( top ) = > { window . scrollTo ( { top , left : 0 , behavior : 'smooth' } ) ; } , final_height + 1000 ) ;
await wait ( delay ) ;
await page . evaluate ( ( ) = > { window . scrollTo ( { top : 0 , left : 0 , behavior : 'smooth' } ) ; } ) ;
await wait ( delay ) ;
}
}
async function submitForm(page, _page_state, {timeout = 5_000} = {}) {
try {
await page . waitForSelector ( 'form button[type=submit]' , { timeout : 1_500 } ) ;
console . log ( '[☑️] Submitting form...' )
await page . click ( 'form button[type=submit]' )
await page . waitForNavigation ( { timeout } ) ;
await page . goBack ( ) ;
} catch ( err ) {
// no form found
}
}
// TODO: add an evasion to set navigator.connection.rtt = 365 (0 = detectable as headless)
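// One possible implementation of the TODO above (untested sketch): spoof a plausible rtt value on the
// NetworkInformation prototype before any page script runs. The value 365 is taken from the TODO, everything
// else here is an assumption.
// await page.evaluateOnNewDocument(() => {
//     try {
//         const proto = Object.getPrototypeOf(navigator.connection)
//         Object.defineProperty(proto, 'rtt', {get: () => 365})
//     } catch (err) {
//         // navigator.connection is not available in every browser/context
//     }
// })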
/******************************************************************************/
/******************************************************************************/
/**************** Extension-Based Archive Output Tasks ************************/
async function saveSinglefile ( page , { main_response , extensions } ) {
const extension = extensions . filter ( ( { name } ) = > name === 'singlefile' ) [ 0 ]
if ( ! extension . version ) throw 'Could not find Singlefile extension ID, is it installed?'
const url = await page . url ( ) || main_response . url ( )
if ( URL_SCHEMES_IGNORED . includes ( url . split ( ':' ) [ 0 ] ) ) return null
// get list of existing past files in downloads/* to ignore
const files_before = new Set (
( await fs . promises . readdir ( CHROME_DOWNLOADS_DIR ) )
. filter ( fn = > fn . endsWith ( '.html' ) )
) ;
const out_path = SINGLEFILE_PATH ( page )
console . log ( ` [🛠️] Saving Singlefile HTML using extension ( ${ extension . id } )... ` . padEnd ( 82 + 1 ) , prettyPath ( CHROME_DOWNLOADS_DIR ) )
await page . bringToFront ( ) // action button acts on the foreground tab, so it has to be in front :(
await extension . dispatchAction ( )
let files_new = [ ]
const check_delay = 3_000
for ( const _try in [ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ] ) {
await wait ( check_delay )
const files_after = ( await fs . promises . readdir ( CHROME_DOWNLOADS_DIR ) ) . filter ( fn = > fn . endsWith ( '.html' ) ) ;
files_new = files_after . filter ( file = > ! files_before . has ( file ) )
if ( files_new . length == 0 ) {
// console.warn(` ...waiting for Singlefile to write HTML into ${CHROME_DOWNLOADS_DIR}...`)
continue
}
// iterate through new downloads and find a matching .html containing our page's URL in the header
for ( const file of files_new ) {
const dl_path = path . join ( CHROME_DOWNLOADS_DIR , file )
const dl_text = await fs . promises . readFile ( dl_path , 'utf-8' )
const dl_header = dl_text . split ( 'meta charset' ) [ 0 ]
if ( dl_header . includes ( ` url: ${ url } ` ) ) {
/// dont need this check anymore as now all output is versioned:
// if (fs.existsSync(out_path)) {
// const {size: existingSize} = await fs.promises.stat(out_path)
// const {size: newFileSize} = await fs.promises.stat(dl_path)
// if (newFileSize < existingSize) {
// console.log(`[🗑️] Discarding singlefile output (${file}) as it's smaller than existing ${out_path}...`)
// await fs.promises.rm(dl_path)
// return out_path
// }
// }
console . log ( ` [✍️] Moving Singlefile download from ${ file } ... ` . padEnd ( 82 ) , prettyPath ( out_path ) )
await fs . promises . rename ( dl_path , out_path )
return out_path
}
}
}
console . warn ( ` [❌] Couldn't find matching Singlefile HTML in ${ CHROME_DOWNLOADS_DIR } after waiting ${ ( check_delay * 10 ) / 1000 } s: ` , files_new . join ( ', ' ) )
return null
}
async function saveArchiveWebPage(page, {extensions}, {timeout = 30_000} = {}) {
// TODO: waiting on them to expose commands so we can generate .wacz easily
// https://github.com/webrecorder/archiveweb.page/issues/207
// ...
const browser = await page . browser ( )
const extension = extensions . filter ( ( { name } ) = > name === 'archivewebpage' ) [ 0 ]
await page . bringToFront ( )
await extension . dispatchPopup ( )
await extension . dispatchAction ( )
const popup = await browser . waitForTarget (
target = > target . url ( ) . toString ( ) . startsWith ( ` chrome-extension:// ${ extension . id } /popup.html ` ) ,
{ timeout : 5_000 } ,
)
await page . bringToFront ( )
// await puppeteer.Locator.race([
// popup.locator('::-p-aria(Start With Autopilot)'),
// popup.locator('wr-popup-viewer >>>> input'),
// popup.locator(':scope >>> input')
// ])
// .setTimeout(timeout)
// .click({
// offset: {
// x: 7.7265625,
// y: 7.203125,
// },
// });
// @ts-ignore
await puppeteer . Locator . race ( [
popup . locator ( 'wr-popup-viewer >>>> div.status-row > p' ) ,
popup . locator ( ':scope >>> div.status-row > p' ) ,
popup . locator ( '::-p-text(Recording: \n)' )
] ) . setTimeout ( timeout ) . click ( {
delay : 733.3000000007451 ,
offset : {
x : 293 ,
y : 13.5 ,
} ,
} )
await wait(8_000)
// @ts-ignore
await puppeteer . Locator . race ( [
popup . locator ( 'wr-popup-viewer >>>> div:nth-of-type(2) > button > span:nth-of-type(2)' ) ,
popup . locator ( ':scope >>> div:nth-of-type(2) > button > span:nth-of-type(2)' ) ,
popup . locator ( '::-p-text(Stop)' )
] ) . setTimeout ( timeout ) . click ( {
offset : {
x : 7.859375 ,
y : 23.203125 ,
} ,
} ) ;
return null
}
async function savePocket ( page , { extensions } ) {
const browser = await page . browser ( )
const extension = extensions . filter ( ( { name } ) = > name === 'pocket' ) [ 0 ]
if ( ! extension . version ) throw 'Could not find Pocket extension ID, is it installed?'
console . log ( ` [🛠️] Saving URL to Pocket API using extension ( ${ extension . id } )... ` , 'https://getpocket.com/saves' )
await page . bringToFront ( ) // action button acts on the foreground tab, so it has to be in front
await extension . dispatchAction ( )
try {
const login_window = await browser . waitForTarget (
target = > target . url ( ) . toString ( ) . startsWith ( 'https://getpocket.com/' ) ,
{ timeout : 3_000 } ,
)
// login window will open if pocket is not signed-in
if ( login_window ) return false
} catch ( e ) {
// no new window should open if it saves correctly
return true
}
}
/***************** Synchronous Archive Output Tasks ***************************/
async function saveScreenrecording ( page , page_state , { save_gif = true } = { } ) {
if ( page_state . recorder ) {
const duration = Date . now ( ) - page_state . start_ts
console . log ( ` [🎥] Saving screen-recording video ( ${ duration / 1000 } s)... ` . padEnd ( 82 ) , prettyPath ( SCREENRECORDING_PATH ( page ) ) )
const recorder = page_state . recorder
page_state . recorder = null
await recorder . stop ( )
// create symlink for legacy path
const snap_dir = page_state . snapshot_dir
const legacy_path = path . join ( snap_dir , 'media' , 'screenrecording.mp4' )
await overwriteSymlink ( SCREENRECORDING_PATH ( page ) , legacy_path , { relative : snap_dir , search_limit : snap_dir } )
// // remove duplicate frames (white frames at start while it loads + static image at end)
// const video_path = SCREENRECORDING_PATH(page)
// const short_path = video_path.replace('.mp4', '.short.mp4')
// try {
// await exec(
// // create a shortened video starting from 0:02s to 0:01s with duplicate frames removed (can look jumpy sometimes)
// `ffmpeg -ss 2 -sseof -1 -y -i ${video_path} -vf mpdecimate,setpts=N/FRAME_RATE/TB ${short_path}`
// )
// } catch(err) {
// console.log('[❌] Failed to shorten screenrecording.mp4')
// }
// convert video to GIF
if ( save_gif ) {
try {
const BIN_NAME = '/Volumes/NVME/Users/squash/bin/ffmpeg'
const child = child_process . spawn (
BIN_NAME ,
[
'-hide_banner' ,
'-loglevel' , 'error' ,
'-ss' , '3' ,
'-t' , '10' ,
'-y' ,
'-i' , SCREENRECORDING_PATH ( page ) ,
'-vf' , "fps=10,scale=1024:-1:flags=bicubic,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse" ,
'-loop' , '0' ,
SCREENRECORDGIF_PATH ( page ) ,
] ,
{
cwd : path.dirname ( SCREENRECORDING_PATH ( page ) ) ,
timeout : 60_000 ,
// stdio: [null, 'pipe', 'pipe'],
stdio : 'ignore' ,
detached : true , // run in background, don't block on response
} ,
)
await blockUntilExists ( SCREENRECORDGIF_PATH ( page ) , { min_bytes : 100 , timeout : 40_000 } )
console . log ( ` [🎥] Saved screen-recording GIF with ffmpeg pid= ${ child . pid } ( ${ duration / 1000 } s)... ` . padEnd ( 82 ) , prettyPath ( SCREENRECORDGIF_PATH ( page ) ) )
const snap_dir = page_state . snapshot_dir
const legacy_path = path . join ( snap_dir , 'media' , 'screenrecording.gif' )
await overwriteSymlink ( SCREENRECORDGIF_PATH ( page ) , legacy_path , { relative : snap_dir , search_limit : snap_dir } )
} catch ( err ) {
console . log ( '[❌] Failed to convert video to GIF:' , err )
}
}
return SCREENRECORDING_PATH ( page )
}
return null
}
async function saveScreenshot(page, _page_state, {aspect_ratio = SCREENSHOT_ASPECT_RATIO, width = null, height = null, jpg_width = 1440, jpg_quality = 90, timeout = 30_000} = {}) {
try { await fs . promises . unlink ( SCREENSHOT_PATH ( page ) ) } catch ( err ) { }
// setup width and height
width = width || DEFAULT_VIEWPORT . width
assert ( ( typeof width === 'number' ) && width > 200 )
height = height || Math . floor ( width / aspect_ratio )
assert ( ( typeof height === 'number' ) && height > 200 )
console . log ( ` [📸] Saving full-page screenshot ( ${ width } x ${ height } px)... ` . padEnd ( 82 ) , prettyPath ( SCREENSHOT_PATH ( page ) ) )
// set width, height, and deviceScale factor: https://github.com/puppeteer/puppeteer/issues/1576
await page . setViewport ( { . . . DEFAULT_VIEWPORT , width , height , deviceScaleFactor : 2 } )
await page . bringToFront ( )
await wait(1_250) // page takes a sec to settle after foregrounding and viewport update
// take lossless fullpage screenshot of 1920x1440+px (4:3+) -> ./screenshot.png
await page . screenshot ( { path : SCREENSHOT_PATH ( page ) , fullPage : true , type : 'png' } )
// wait for the screenshot to be created, then set the viewport to the next size
await blockUntilExists ( SCREENSHOT_PATH ( page ) , { min_bytes : 100 , timeout } )
await wait(6_000) // puppeteer takes a while to finish writing png data when fullPage: true
const jpg_height = Math . floor ( jpg_width / aspect_ratio )
await page . setViewport ( { . . . DEFAULT_VIEWPORT , width : jpg_width , height : jpg_height , deviceScaleFactor : 2 } )
await wait(1_250) // page takes a sec to settle after foregrounding and viewport update
// WARNING: make sure you never try to create two screenshots at the same time (especially not fullpage screenshots)
// that's why there are all these delays here.
// screenshot creation messes up the whole viewport while it's running,
// and it writes bad/white empty screenshots if you try to make more than one concurrently
// (see the lock sketch after this function for one way to serialize screenshot calls)
// take compressed screenshot of jpg_width*jpg_height (4:3) -> ./screenshot.jpg
await page . screenshot ( {
path : SCREENSHOT_JPG_PATH ( page ) ,
type : 'jpeg' ,
quality : jpg_quality ,
clip : {
x : 0 ,
y : 0 ,
width : jpg_width ,
height : jpg_height ,
} ,
captureBeyondViewport : false ,
} ) ;
await blockUntilExists ( SCREENSHOT_JPG_PATH ( page ) , { min_bytes : 100 , timeout : timeout / 2 } )
console . log ( ` [📸] Saved screenshot as screenshot.jpg ( ${ jpg_width } x ${ jpg_height } px)... ` . padEnd ( 82 ) , prettyPath ( SCREENSHOT_JPG_PATH ( page ) ) )
// reset viewport back to defaults
await wait(1_250)
await page . setViewport ( DEFAULT_VIEWPORT )
// ALTERNATIVE METHOD based on cropping fullpage png and converting to jpg manually:
// import {PNG} from 'pngjs';
// import jpeg from 'jpeg-js';
// setTimeout(async () => {
// try {
// const screenshot_png = SCREENSHOT_PATH(page);
// const screenshot_jpg = SCREENSHOT_JPG_PATH(page)
// const jpg_max_height = height
// const jpg_quality = quality; // Adjust the quality as needed (0-100)
// fs.createReadStream(screenshot_png)
// .pipe(new PNG())
// .on('parsed', function () {
// const width = this.width;
// const height = this.height;
// let cropped_height = height;
// if (height > jpg_max_height) {
// cropped_height = jpg_max_height;
// }
// const cropped_bytes = new Uint8Array(width * cropped_height * 4);
// for (let y = 0; y < cropped_height; y++) {
// for (let x = 0; x < width; x++) {
// const idx = (width * y + x) << 2;
// cropped_bytes[idx] = this.data[idx];
// cropped_bytes[idx + 1] = this.data[idx + 1];
// cropped_bytes[idx + 2] = this.data[idx + 2];
// cropped_bytes[idx + 3] = this.data[idx + 3];
// }
// }
// const jpeg_obj = {
// data: cropped_bytes,
// width: width,
// height: cropped_height,
// };
// const jpeg_bytes = jpeg.encode(jpeg_obj, jpg_quality);
// fs.writeFileSync(screenshot_jpg, jpeg_bytes.data);
// console.log(`[📸] Saved screenshot as screenshot.jpg (${width}x${jpg_max_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))
// });
// } catch(err) {
// console.error('[X] Error while generating JPG screenshot', SCREENSHOT_JPG_PATH(page), err)
// }
// }, DELAY_BEFORE_JPG_CONVERSION)
// ALTERNATIVE METHOD TO WRITE SCREENSHOT JPG:
// await wait(5_000) // puppeteer takes a while to finish writing png data when fullPage: true
// if ((await page.evaluate('document.body.scrollHeight')) > max_height) {
// // if page exceeds max_height, save additional cropped screenshot as screenshot.top.png
// // (needed b.c. uncropped screenshot may have insane 1:20+ aspect ratio that is hard to use elsewhere)
// await page.screenshot({ path: SCREENSHOT_JPG_PATH(page), type: 'jpg', quality: 100})
// await wait(1_000) // page takes a sec settle after a screenshot
// }
return SCREENSHOT_PATH ( page )
}
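// Sketch of one way to serialize screenshot calls across concurrent tabs (referenced in the WARNING above).
// This promise-chain lock is illustrative only; nothing in this file uses it.
// let screenshot_lock = Promise.resolve()
// function withScreenshotLock(fn) {
//     const task = screenshot_lock.then(fn)
//     screenshot_lock = task.catch(() => {})   // keep the chain alive even if fn throws
//     return task
// }
// // usage: await withScreenshotLock(() => saveScreenshot(page, page_state))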
async function savePDF(page, _page_state, {timeout = 30_000} = {}) {
const url = page . url ( ) || 'about:blank'
if ( URL_SCHEMES_IGNORED . includes ( url . split ( ':' ) [ 0 ] ) ) return null
const out_path = PDF_PATH ( page )
console . log ( ` [📓] Saving print-as-PDF export... ` . padEnd ( 82 ) , prettyPath ( out_path ) )
await page . bringToFront ( )
try { await fs . promises . unlink ( PDF_PATH ( page ) ) } catch ( err ) { }
// await page.emulateMediaType('screen') // print as "@media(screen) instead of @media(print)"
// page.createPDFStream lets us save larger PDFs than page.pdf() before crashing
// (streams to disk in chunks instead of all at once)
const pdf_stream = await page . createPDFStream ( {
timeout : timeout ,
printBackground : true ,
outline : true ,
tagged : true ,
format : 'A4' ,
displayHeaderFooter : false ,
// margin: { top: '0.5cm', right: '1cm', bottom: '0.8cm', left: '1cm' },
} )
const reader = pdf_stream . getReader ( )
// iterate through reader and append chunks to out_path
await fs . promises . rm ( out_path , { force : true } )
let num_bytes = 0
let error = '0 bytes written'
try {
    while (true) {
        const {done, value} = await reader.read()
        if (done) break;
        await fs.promises.appendFile(out_path, value)
        num_bytes += value.length;
    }
} catch (err) {
    error = err // assign to the outer `error` var (a shadowing catch(error) would hide it from the warning below)
    num_bytes = 0
}
if ( ! num_bytes ) {
console . warn ( '[❌] Failed to save PDF' , JSON . stringify ( error , null , 4 ) )
await fs . promises . rm ( out_path , { force : true } )
return null
}
return out_path
}
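// Alternative sketch for savePDF's chunk loop: pipe the WHATWG stream straight to disk using node:stream
// helpers (assumes Readable from 'node:stream' and finished from 'node:stream/promises' are imported up top).
// async function savePDFViaPipe(page, out_path = PDF_PATH(page)) {
//     const pdf_stream = await page.createPDFStream({timeout: 30_000, printBackground: true, format: 'A4'})
//     const file_stream = fs.createWriteStream(out_path)
//     Readable.fromWeb(pdf_stream).pipe(file_stream)
//     await finished(file_stream)
//     return out_path
// }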
async function inlineShadowDOM(page, _page_state, {limit = 100_000} = {}) {
console . log ( ` [😎] Replacing Shadow DOM elements with inline HTML... ` )
try {
const num_replaced = await page . evaluate ( ( limit ) = > {
let num_replaced = 0
// Returns HTML of given shadow DOM.
const getShadowDomHtml = ( shadowRoot ) = > {
let shadowHTML = '' ;
for ( const el of shadowRoot . childNodes ) {
shadowHTML += el . nodeValue || el . outerHTML ;
}
return shadowHTML ;
} ;
// Recursively replaces shadow DOMs with their HTML.
const replaceShadowDomsWithHtml = ( rootElement ) = > {
if ( num_replaced > limit ) return
for ( const el of rootElement . querySelectorAll ( '*' ) ) {
if ( el . shadowRoot ) {
replaceShadowDomsWithHtml ( el . shadowRoot ) ;
el . innerHTML += getShadowDomHtml ( el . shadowRoot ) ;
}
}
num_replaced ++
} ;
replaceShadowDomsWithHtml ( document . body ) ;
return num_replaced
} , limit )
// console.log(' √ replaced', num_replaced, 'Shadow DOM trees')
} catch ( err ) {
console . log ( '[⚠️] Inlining Shadow DOM failed' , err )
}
}
async function saveAIQualityAssuranceResult ( page , { original_url , version } ) {
console . log ( ` [🧠] Analyzing screenshot with GPT-4o for QA checks... ` . padEnd ( 82 ) , prettyPath ( AIQA_PATH ( page ) ) )
let screenshot_path = SCREENSHOT_PATH ( page )
const screenshot_cropped_path = SCREENSHOT_JPG_PATH ( page )
if ( fs . existsSync ( screenshot_cropped_path ) ) {
// screenshot is too tall to pass to openai, send cropped version instead
screenshot_path = screenshot_cropped_path
}
try {
await blockUntilExists ( screenshot_path , { min_bytes : 100 , timeout : 7_500 } )
} catch ( err ) {
console.warn('[❌] Failed to send screenshot to GPT-4o for analysis, no screenshot.{png,jpg} exists', err)
return null
}
var stdout = ''
var stderr = ''
let result = null
const PYTHON_BIN = path . join ( __dirname , '.venv/bin/python' )
const SCRIPT_PATH = path . join ( __dirname , 'ai_qa.py' )
await blockUntilExists ( PYTHON_BIN , { min_bytes : 1 , timeout : 250 } )
await blockUntilExists ( SCRIPT_PATH , { min_bytes : 1 , timeout : 250 } )
try {
var { stdout , stderr } = await exec (
` ${ PYTHON_BIN } ${ SCRIPT_PATH } --attach ' ${ screenshot_path } ' `
)
result = JSON . parse ( stdout . toString ( ) )
if ( ! result ) throw 'Got empty result!'
result = {
TYPE : 'aiqa' ,
VERSION : version ,
URL : original_url ,
. . . result ,
}
} catch ( parse_err ) {
console . warn ( '[❌] Failed to get OpenAI analysis for screenshot.png' , parse_err , stderr )
}
if ( ! ( result || stdout ) ) {
return null
}
await overwriteFile (
AIQA_PATH ( page ) ,
result || stdout . toString ( ) ,
)
return result
}
async function saveYTDLP ( page , { original_url , version } , { max_size = '750m' } = { } ) {
console . log ( ` [🎥] Saving media with YT-DLP (<= ${ max_size } )... ` . padEnd ( 82 ) , prettyPath ( YTDLP_PATH ( page ) ) )
await fs . promises . mkdir ( YTDLP_PATH ( page ) , { recursive : true } )
const cwd = YTDLP_PATH ( page )
const bin_name = 'yt-dlp'
const timeout = 300_000 // 5min timeout
const args = [
'--restrict-filenames' ,
'--trim-filenames' , '128' ,
'--write-description' ,
'--write-info-json' ,
'--write-annotations' ,
'--write-thumbnail' ,
'--no-call-home' ,
'--write-sub' ,
'--write-auto-subs' ,
'--convert-subs=srt' ,
'--yes-playlist' ,
'--continue' ,
'--no-abort-on-error' ,
'--ignore-errors' ,
'--geo-bypass' ,
'--add-metadata' ,
` --format=(bv*+ba/b)[filesize<= ${ max_size } ][filesize_approx<=? ${ max_size } ]/(bv*+ba/b) ` ,
'--no-check-certificate' ,
'--no-progress' ,
// `--cookies=${COOKIES_TXT_PATH}`, // using logged in cookies actually makes it fail more often, not sure why
original_url ,
]
const { getResult , . . . exec_info } = await saveExecResult ( bin_name , args , { original_url , version } , { cwd , timeout } )
return { getResult , . . . exec_info }
}
async function saveGALLERYDL ( page , { original_url , version } ) {
console . log ( ` [🎥] Saving photos with gallery-dl... ` . padEnd ( 82 ) , prettyPath ( GALLERYDL_PATH ( page ) ) )
await fs . promises . mkdir ( GALLERYDL_PATH ( page ) , { recursive : true } )
const cwd = GALLERYDL_PATH ( page )
const bin_name = 'gallery-dl'
const timeout = 300_000 // 5min timeout
const args = [
'--verbose' ,
'--write-metadata' ,
'--write-infojson' ,
'--write-tags' ,
'--sleep=1.5-2.5' ,
` --cookies= ${ COOKIES_TXT_PATH } ` ,
// '--no-check-certificate',
// `--directory=media`,
original_url ,
]
const { getResult , . . . exec_info } = await saveExecResult ( bin_name , args , { original_url , version } , { cwd , timeout } )
return { getResult , . . . exec_info }
}
// async function saveWget(page, {original_url, version}) {
// console.log(`[⎒] Saving wget site clone...`.padEnd(82), prettyPath(WGET_PATH(page)))
// const args = [
// // ...
// ]
// spawn(
// 'wget',
// [
// ...args,
// original_url,
// ],
// {
// cwd: WGET_PATH(page),
// detached: true, // run in background, don't block on response
// stdio: 'ignore',
// timeout: 300_000, // 5min timeout
// },
// )
// return {path: WGET_PATH(page)}
// }
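// If saveWget is ever re-enabled, a typical flag set for a static clone might look like the sketch below.
// These are common wget options chosen as an example, not flags taken from this project's config.
// const args = [
//     '--no-verbose',
//     '--page-requisites',      // fetch images/css/js needed to render the page
//     '--adjust-extension',     // add .html to extensionless pages
//     '--convert-links',        // rewrite links for offline browsing
//     '--span-hosts',           // follow CDN/static asset subdomains
//     '--no-parent',
//     '-e', 'robots=off',
//     '--restrict-file-names=windows',
//     '--timeout=60',
// ]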
/**************** Asynchronous Archive Output Tasks ***************************/
type FaviconCandidate = {
url : string ,
basename : string ,
extension : string ,
expected_mimetype : string ,
}
const faviconFromDomain = ( url ) = > {
// https://auth:pass@t.co:1234/a/bc123 -> https://auth:pass@t.co:1234/favicon.ico
const url_origin = ( new URL ( url ) ) . origin
return {
url : url_origin ? ` ${ url_origin } /favicon.ico ` : null ,
basename : 'favicon' ,
extension : undefined , // auto-detect extension at download time in case it redirects us to a png
expected_mimetype: 'image/', // only accept image/* to avoid saving html/txt error responses as icon
} as FaviconCandidate
}
const faviconFromGoogle = ( url , size = 256 ) = > {
// https://auth:pass@t.co:1234/a/bc123 -> https://www.google.com/s2/favicons?domain=t.co
const domain = url && ( new URL ( url ) ) . hostname
return {
url : domain?.includes ( '.' ) ? ` https://www.google.com/s2/favicons?sz= ${ size } ,domain= ${ domain } ` : null ,
basename : 'google_favicon' ,
extension : 'png' ,
expected_mimetype : 'image/png' , // google always provides PNGs in response
} as FaviconCandidate
}
const faviconFromHtml = async ( page ) = > {
// <link rel="icon" src="https://example.com/static/images/favicon.png"/> -> https://example.com/static/images/favicon.png
let url
try {
url = await page . $eval ( 'link[rel*="icon"]' , ( elem ) = > elem ? . href )
if ( ! url || ! url . includes ( '://' ) )
url = null
} catch ( err ) {
url = null
// console.warn('Failed to find favicon tag in html', JSON.stringify(err, null, 4))
}
return {
url ,
basename : 'favicon' ,
extension : undefined , // auto-detect extension at download time
expected_mimetype : 'image/' , // accept any image/* mimetype at download time
} as FaviconCandidate
}
type FaviconResult = {
url : string ,
num_bytes : number ,
abspath? : string ,
dir? : string ,
filename? : string ,
mimeType? : string ,
}
async function saveFavicon ( page , { original_url , main_response , version } ) {
const dir = path . dirname ( FAVICON_PATH ( page ) )
const response_url = main_response ? . url ( )
const favicon_downloads_to_try : { [ key : string ] : FaviconCandidate } = unique ( [
await faviconFromHtml ( page ) ,
faviconFromDomain ( response_url ) ,
faviconFromDomain ( original_url ) ,
faviconFromGoogle ( response_url ) ,
faviconFromGoogle ( original_url ) ,
] . filter ( ( { url } ) = > url ) , 'url' )
const browser = await page . browser ( )
// let logs = []
// let errors = []
let output_files : { [ key : string ] : FaviconResult } = { }
for ( const download_options of Object . values ( favicon_downloads_to_try ) ) {
let result : FaviconResult = { num_bytes : 0 , url : download_options.url }
// {url, num_bytes, abspath, dir, filename, basename, extension, mimeType}
try {
// try getting it with node-fetch first
const response = await fetch ( download_options . url ) as Response
const file_options = await detectFilename ( { . . . download_options , response , dir } )
if ( response . headers . get ( "content-length" ) ) {
const favicon_stream = Readable . fromWeb ( response . body as any )
await overwriteFile ( file_options . abspath , favicon_stream )
result = {
. . . file_options ,
num_bytes : parseInt ( response . headers . get ( "content-length" ) || '0' ) ,
mimeType : response.headers.get ( "content-type" ) ,
}
} else {
throw 'Failed to download favicon with fetch()'
}
} catch ( err ) {
// console.warn('[!] Failed to get favicon with node-fetch', err)
// fallback to getting it by opening a new browser tab
result = await download ( { . . . download_options , browser , dir , page } )
}
// logs.push(...(result.logs || []))
// errors.push(...(result.errors || []))
if ( result . num_bytes ) {
console . log ( ` [🌠] Saving page favicon ( ${ result . url . substring ( 0 , 35 ) } ... ${ result . mimeType } )... ` . padEnd ( 82 ) , prettyPath ( result . abspath ) )
output_files [ result . filename ] = result
break // break here stops after the first successful download, comment out to keep going instead
}
}
const output_file = Object.values(output_files).sort((a, b) => a.num_bytes - b.num_bytes).at(-1) // pick the largest favicon that was downloaded
const favicon_info = {
TYPE : 'favicon' ,
VERSION : version ,
URL : original_url ,
succeeded : ! ! output_file ,
// stdout: JSON.stringify(logs),
// stderr: JSON.stringify(errors),
favicon_url : output_file?.url ,
favicon_urls : Object.keys ( favicon_downloads_to_try ) ,
favicon_files : Object.keys ( output_files ) . map ( fname = > fname . replace ( dir , '.' ) ) ,
favicon_filename : output_file?.filename ,
favicon_num_bytes : output_file?.num_bytes ,
}
await overwriteFile ( FAVICON_PATH ( page ) , favicon_info )
return favicon_info
}
async function saveTitle ( page , { original_url , version } ) {
const title_from_browser = ( await page . title ( ) ) || null
const title_from_js = await page . evaluate ( ( ) = > document ? . title || null )
const title_from_html = await page . evaluate ( ( ) = > document ? . querySelector ( 'title' ) ? . innerText || null )
const title_from_og = await page . evaluate ( ( ) = > document ? . querySelector ( 'meta[property="og:title"]' ) ? . getAttribute ( 'content' ) || null )
// best guess at best title = longest title
const title = ( [ title_from_html , title_from_og , title_from_js , title_from_browser ]
. filter ( title = > title )
. sort ( ( a , b ) = > b . length - a . length ) [ 0 ] || '' )
. replaceAll ( '\n' , ' ' )
if ( title ? . length ) {
console . log ( ` [📗] Saving page title ( ${ title . substring ( 0 , 40 ) } )... ` . padEnd ( 82 ) , prettyPath ( TITLE_PATH ( page ) ) )
await overwriteFile ( TITLE_PATH ( page ) , title )
}
const title_info = {
TYPE : 'title' ,
VERSION : version ,
URL : original_url ,
title ,
title_from_html ,
title_from_og ,
title_from_js ,
title_from_browser ,
}
const title_json_path = TITLE_PATH ( page ) . replace ( '.txt' , '.json' )
await overwriteFile ( title_json_path , title_info )
return title_info
}
async function saveRaw ( page , { main_response } ) {
const response = main_response
if ( ! response ) {
console . warn ( '[⚠️] Failed to save page RAW bytes, main_response is null' , response )
}
const dir = RAW_PATH ( page )
await fs . promises . mkdir ( dir , { recursive : true } )
const { url , abspath , mimeType } = await detectFilename ( { page , response , dir } )
console . log ( ` [🔟] Saving raw response bytes ( ${ mimeType } )... ` . padEnd ( 82 ) , prettyPath ( abspath ) )
await download ( { page , response , abspath } )
return abspath
}
async function saveSourceMaps ( page , { original_url , version } ) {
console . log ( ` [🐛] Saving source maps to ./responses/all/*.{js,css}.map... ` )
const response_index_path = path . join ( RESPONSES_PATH ( page ) , 'index.jsonl' )
const response_index = await fs . promises . readFile ( response_index_path , 'utf-8' )
const urls_to_download = [ ]
for ( const response of response_index . split ( '\n' ) ) {
try {
const { url , extension } = JSON . parse ( response )
if ( [ 'css' , 'js' ] . includes ( extension ? . toLowerCase ( ) ) ) {
urls_to_download . push ( url + '.map' )
}
} catch ( err ) { continue }
}
// TODO: fix this, it needs to run both after the responses index.jsonl has been written and before metadata recording stops
// the fix is to use traffic_log to get the response url list instead of waiting for index.jsonl to be created
await page . evaluate ( async ( urls_to_download ) = > {
const promises = [ ]
for (const sourcemap_url of urls_to_download) {
promises . push ( fetch ( sourcemap_url ) )
}
return Promise . allSettled ( promises )
} , urls_to_download )
return {
TYPE : 'sourcemaps' ,
URL : original_url ,
VERSION : version ,
sourcemaps : urls_to_download ,
}
}
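// Possible fix for the TODO above (sketch): derive the .map candidates from traffic_log instead of waiting
// for responses/all/index.jsonl, so this can run while metadata recording is still active.
// const urls_from_traffic = Object.values(traffic_log)
//     .map(events => events['Network.responseReceived']?.response?.url)
//     .filter(url => url && /\.(js|css)(\?|#|$)/i.test(url))
//     .map(url => url.split('?')[0].split('#')[0] + '.map')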
async function saveRequests ( page , { original_url , version , traffic_log } ) {
console . log ( ` [📼] Saving requests log ( ${ Object . keys ( traffic_log ) . length } )... ` . padEnd ( 82 ) , prettyPath ( REQUESTS_PATH ( page ) ) )
const requests_info = {
TYPE : 'requests' ,
VERSION : version ,
URL : original_url ,
requests : traffic_log ,
}
await overwriteFile ( REQUESTS_PATH ( page ) , requests_info )
return requests_info
}
async function saveRedirects ( page , { original_url , main_response , traffic_log , redirects , version } ) {
const main_request_id = Object . keys ( traffic_log ) . filter ( id = > ! id . includes ( '.' ) ) [ 0 ]
const main_response_traffic = traffic_log [ main_request_id ] || { }
const url_from_browser = await page . url ( ) || null
const url_from_request = (
main_response ? . request ( ) ? . url ( )
|| main_response_traffic [ 'Network.requestWillBeSent' ] ? . request ? . url
|| null )
const url_from_response = (
main_response ? . url ( )
|| main_response_traffic [ 'Network.responseReceived' ] ? . main_response ? . url
|| null )
const http_redirects =
Object . values ( traffic_log )
. filter ( event = > event [ 'Network.requestWillBeSent' ] ? . redirectResponse )
. map ( event = > event [ 'Network.requestWillBeSent' ] )
. map ( requestWillBeSent = > ( {
url : requestWillBeSent.request.url ,
src : requestWillBeSent.redirectResponse.url ,
status : requestWillBeSent.redirectResponse.status ,
loaderId : requestWillBeSent.loaderId ,
requestId : requestWillBeSent.requestId ,
wallTime : requestWillBeSent.wallTime ,
initiator : requestWillBeSent.initiator ,
isMainFrame : ( requestWillBeSent . loaderId == main_request_id ) ,
} ) )
const url_parsed = new URL ( url_from_response || url_from_request || url_from_browser )
const redirects_info = {
TYPE : 'redirects' ,
VERSION : version ,
URL : original_url ,
url_parsed ,
url_from_request ,
url_from_response ,
url_from_browser ,
redirects_from_browser : redirects ,
redirects_from_http : http_redirects ,
}
console . log ( ` [🔗] Saving page redirects log ( ${ http_redirects . length } )... ` . padEnd ( 82 ) , prettyPath ( REDIRECTS_PATH ( page ) ) )
await overwriteFile ( REDIRECTS_PATH ( page ) , redirects_info )
return redirects_info
}
async function saveHeaders ( page , { original_url , version , traffic_log } ) {
const main_request_id = Object . keys ( traffic_log ) . filter ( id = > ! id . includes ( '.' ) ) [ 0 ]
const main_response_traffic = traffic_log [ main_request_id ] || { }
// combine base request with browser-added request headers
const request = { . . . main_response_traffic [ 'Network.requestWillBeSent' ] ? . request }
const request_extra_headers = main_response_traffic [ 'Network.requestWillBeSentExtraInfo' ] ? . headers || { }
request . headers = { . . . request . headers , . . . request_extra_headers }
// combine base response with browser-added response headers
const response = { . . . main_response_traffic [ 'Network.responseReceived' ] ? . response }
const response_extra_headers = main_response_traffic [ 'Network.responseReceivedExtraInfo' ] ? . headers || { }
response . headers = { . . . response . headers , . . . response_extra_headers }
const headers_info = {
TYPE : 'headers' ,
VERSION : version ,
URL : original_url ,
request ,
response ,
}
const num_headers = Object . keys ( { . . . request . headers , . . . response . headers } ) . length
if ( num_headers ) {
console . log ( ` [👾] Saving main request & response headers ( ${ num_headers } )... ` . padEnd ( 82 ) , prettyPath ( HEADERS_PATH ( page ) ) )
await overwriteFile ( HEADERS_PATH ( page ) , headers_info )
}
return headers_info
}
async function saveSSL ( page , { original_url , version , traffic_log } ) {
const main_request_id = Object . keys ( traffic_log ) . filter ( id = > ! id . includes ( '.' ) ) [ 0 ]
const main_response_traffic = traffic_log [ main_request_id ] || { }
const relevant_response_keys = [
'url' ,
'status' ,
'mimeType' ,
'connectionReused' ,
'remoteIPAddress' ,
'remotePort' ,
'fromServiceWorker' ,
'encodedDataLength' ,
'protocol' ,
'alternateProtocolUsage' ,
'securityState' ,
'securityDetails' ,
]
let ssl_info = Object . entries ( main_response_traffic [ 'Network.responseReceived' ] ? . response || { } )
. reduce ( ( obj , [ key , val ] ) = > {
if ( relevant_response_keys . includes ( key ) ) {
obj [ key ] = val
}
return obj
} , { } ) as any
// TODO: parse SSL certificate sha256 hash from chrome://system/#chrome_root_store
// const ssl_certificate = await client.send('Network.getCertificate', {origin: original_url})
// ssl_info.sslCertSha256 = '<unknown>'
ssl_info = {
TYPE : 'ssl' ,
VERSION : version ,
URL : original_url ,
. . . ssl_info ,
}
if ( Object . keys ( ssl_info ) . length - 3 ) {
console . log ( ` [🔏] Saving page SSL details ( ${ ssl_info ? . securityDetails ? . protocol } )... ` . padEnd ( 82 ) , prettyPath ( SSL_PATH ( page ) ) )
await overwriteFile ( SSL_PATH ( page ) , ssl_info )
}
return ssl_info
}
async function saveDOM ( page , { original_url , version } ) {
const html = await page . content ( ) ;
console . log ( ` [📖] Saving DOM dump ( ${ html . length } )... ` . padEnd ( 82 ) , prettyPath ( DOM_PATH ( page ) ) )
const html_with_header =
    `<!-- Saved by ArchiveBox TYPE=dom VERSION=${version} URL=${original_url} -->\n${html}`
await overwriteFile ( DOM_PATH ( page ) , html_with_header )
return DOM_PATH ( page )
}
async function saveBodyText ( page , _page_state ) {
const innerText = await page . evaluate ( ( ) = > document ? . body ? . innerText ) ;
if ( innerText ? . length ) {
console . log ( ` [📃] Saving body text ( ${ innerText . length } )... ` . padEnd ( 82 ) , prettyPath ( BODYTEXT_PATH ( page ) ) )
await overwriteFile ( BODYTEXT_PATH ( page ) , innerText )
}
// // alternative method: emulate Ctrl+A, Ctrl+C (sometimes gets more than body.innerText)
// const innerText = await page.$eval('*', (el) => {
// const selection = window.getSelection();
// const range = document.createRange();
// range.selectNode(el);
// selection.removeAllRanges();
// selection.addRange(range);
// return window.getSelection().toString();
// });
return innerText
}
async function savePandoc ( page , { original_url , version } ) {
console . log ( ` [📒] Converting DOM HTML to markdown with Pandoc... ` . padEnd ( 82 ) , prettyPath ( PANDOC_PATH ( page ) ) )
let dom_paths = [ DOM_PATH ( page ) , SINGLEFILE_PATH ( page ) ] . filter ( fs . existsSync )
if (!dom_paths.length) return null
const dom_path = dom_paths [ 0 ]
var stdout : string = ''
var stderr : string = ''
let result : any = null
const BIN_NAME = 'pandoc'
// pandoc --from html --to markdown_github --citeproc --wrap=none --highlight-style=kate
const args = [
BIN_NAME ,
'--from=html' ,
'--to=markdown_github' ,
'--wrap=none' ,
'--citeproc' ,
'--highlight-style=kate' ,
` --output=' ${ PANDOC_PATH ( page ) } ' ` ,
dom_path ,
]
try {
; ( { stdout , stderr } = await exec ( args . join ( ' ' ) ) ) ;
stdout = stdout . toString ( ) . trim ( )
if ( ! stdout ) throw 'Got empty result!'
result = {
TYPE : 'pandoc' ,
VERSION : version ,
URL : original_url ,
cmd : args ,
markdown_file : PANDOC_PATH ( page ) ,
}
} catch ( parse_err ) {
console . warn ( '[❌] Failed to run Pandoc HTML to MD conversion' , parse_err , stderr )
}
if ( ! stdout ) { return null }
await overwriteFile (
PANDOC_PATH ( page ) ,
stdout ,
)
// pandoc --from markdown_github --to html --citeproc --wrap=none --highlight-style=kate
const reverse_conversion_args = [
BIN_NAME ,
'--from=markdown_github' ,
'--to=html' ,
'--wrap=none' ,
'--citeproc' ,
'--highlight-style=kate' ,
` --output=' ${ PANDOC_PATH ( page ) . replace ( '.md' , '.html' ) } ' ` ,
PANDOC_PATH ( page ) ,
]
try {
; ( { stdout , stderr } = await exec ( reverse_conversion_args . join ( ' ' ) ) ) ;
stdout = stdout . toString ( ) . trim ( )
if ( ! stdout ) throw 'Got empty result!'
result = {
. . . result ,
html_file : PANDOC_PATH ( page ) . replace ( '.md' , '.html' ) ,
}
} catch ( parse_err ) {
console . warn ( '[❌] Failed to run Pandoc MD to HTML conversion' , parse_err , stderr )
}
if ( ! result ) { return null }
await overwriteFile (
PANDOC_PATH ( page ) . replace ( '.md' , '.html' ) ,
result ,
)
return result
}
async function saveReadability ( page , { original_url , version } ) {
const url = await page . url ( )
let html = ''
let article = null
try {
html = await page . content ( )
if (html.length > 14_000_000) {
    console.warn('[⚠️] Truncating readability article text because html is too long...', html.length)
    html = html.substring(0, 13_900_000)
}
const virtualConsole = new VirtualConsole ( )
const dom = new JSDOM ( html , { url , virtualConsole } )
const reader = new Readability ( dom . window . document ) ;
article = reader . parse ( )
} catch ( err ) {
console . warn ( ` [❌] Failed to get readability article text ` )
return null
}
if ( article ) {
console . log ( ` [📜] Saving readability article text ( ${ article . textContent ? . length } )... ` . padEnd ( 82 ) , prettyPath ( READABILITY_PATH ( page ) ) )
const { content , textContent , . . . metadata } = article
if ( content . trim ( ) ) {
await overwriteFile ( READABILITY_PATH ( page ) . replace ( '.json' , '.html' ) , content ) ;
}
if ( textContent . trim ( ) ) {
await overwriteFile ( READABILITY_PATH ( page ) . replace ( '.json' , '.txt' ) , textContent ) ;
}
const readability_info = {
TYPE : 'readability' ,
VERSION : version ,
URL : original_url ,
. . . metadata ,
}
await overwriteFile ( READABILITY_PATH ( page ) , readability_info )
return readability_info
}
return null
}
async function saveAccessibility ( page , { original_url , version } ) {
// get accessibility tree
const accessibility_tree = await page . accessibility . snapshot ( { interestingOnly : true } ) ;
// console.log(accessibility_tree);
// get iframe tree
const iframes = [ ]
function dumpFrameTree ( frame , indent = '>' ) {
iframes . push ( indent + frame . url ( ) ) ;
for ( const child of frame . childFrames ( ) ) {
dumpFrameTree ( child , indent + '>' ) ;
}
}
dumpFrameTree ( page . mainFrame ( ) , '' ) ;
// console.log(iframes)
// generate simple table-of-contents of all the key html elements (e.g. h1, h2, h3, article, main, etc.)
const outline = await page . evaluate ( ( ) = > {
const headings = [ ]
for ( const elem of [ . . . document . querySelectorAll ( "h1, h2, h3, h4, h5, h6, a, header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe" ) ] as HTMLElement [ ] ) {
// skip a tags that aren't named anchors
if ( elem . tagName . toLowerCase ( ) == 'a' && ! ( elem as HTMLAnchorElement ) . name ) continue
// e.g. article #main-article
const elem_id = ( ( typeof elem . id === 'string' && elem . id ) || ( elem as HTMLAnchorElement ) . name || elem . ariaLabel || elem . role || '' )
const elem_classes = elem . className . trim ( ) . split ( ' ' ) . slice ( 0 , 3 ) . join ( ' .' ) || ''
const elem_action = ( elem as any ) . action ? . split ( '/' ) ? . slice ( - 1 ) ? . join ( '/' )
const summary = elem . innerText . length > 128
? ` ${ elem . innerText ? . slice ( 0 , 128 ) } ... `
: elem . innerText
let prefix = ''
let title = ( elem_id ? ` # ${ elem_id } ` : '' )
if ( ! title && elem_classes ) title = ` . ${ elem_classes } `
if ( elem_action ) title = ` ${ title } / ${ elem_action } `
if ( summary ) title = ` ${ title } : ${ summary } `
// if elem is a header, prepend a #### prefix based on its level
const level = Number ( elem . tagName . toLowerCase ( ) . replace ( 'h' , '' ) )
if ( ! isNaN ( level ) ) {
prefix = '#' . repeat ( level )
title = elem . innerText || elem_id || elem_classes
} else {
// set prefix to element's breadcrumb path
let node = elem
const parents = [ elem . tagName ? . toLowerCase ( ) . trim ( ) ]
while ( node ) {
// add each parent element's name to the path
// const elem_type = node.tagName?.toLowerCase().trim() || ''
// if (elem_type && !['div', 'span', 'p', 'body', 'html'].includes(elem_type)) {
// parents.unshift(elem_type);
// }
parents.unshift('') // add empty string to abbreviate path as >>>> instead of main>article>header>div>...
node = node . parentNode as HTMLElement
}
prefix = parents . join ( '>' )
}
// strip all repeated whitespace and newlines
title = title . replaceAll ( '\n' , ' ' ) . replace ( /\s+/g , ' ' ) . trim ( )
if ( prefix ) {
headings . push ( ` ${ prefix } ${ title } ` )
}
}
// console.log(headings.join('\n'))
return headings
} )
console . log ( ` [🩼] Saving accessibility outline ( ${ Object . keys ( accessibility_tree ) . length } )... ` . padEnd ( 82 ) , prettyPath ( ACCESIBILITY_PATH ( page ) ) )
// console.log(outline.filter(line => line.startsWith('#')).join('\n'))
const accessibility_info = {
TYPE : 'accessibility' ,
VERSION : version ,
URL : original_url ,
iframes ,
headings : outline ,
tree : accessibility_tree ,
}
await overwriteFile (
ACCESIBILITY_PATH ( page ) ,
accessibility_info ,
)
return accessibility_info
}
async function saveSEO ( page , { original_url , version } ) {
// collect all <meta name="title" property="og:title" content="Page Title for SEO | Somesite.com"> tags into dict
const seo_vars = await page . evaluate ( ( ) = >
[ . . . document . querySelectorAll ( 'meta' ) ]
. map ( tag = > ( { key : tag.getAttribute ( 'name' ) || tag . getAttribute ( 'property' ) || '' , value : tag.getAttribute ( 'content' ) || '' } ) )
. filter ( obj = > obj . key && obj . value )
. sort ( ( a , b ) = > a . value . length - b . value . length )
. reduce ( ( acc , node ) = > { acc [ node . key ] = node . value ; return acc } , { } )
)
const seo_info = {
TYPE : 'seo' ,
VERSION : version ,
URL : original_url ,
. . . seo_vars ,
}
const num_vars = Object . keys ( seo_vars ) . length
if ( num_vars ) {
console . log ( ` [🔎] Saving page SEO metadata ( ${ num_vars } )... ` . padEnd ( 82 ) , prettyPath ( SEO_PATH ( page ) ) )
await overwriteFile ( SEO_PATH ( page ) , seo_info )
}
return seo_info
}
async function saveOutlinks ( page , { original_url , version } ) {
// TODO: slow to iterate over all elements so many times, perhaps we can collapse everything down into one loop
// Regular expression that matches syntax for a link (https://stackoverflow.com/a/3809435/117030):
const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi ;
const filterW3Urls = ( urls ) = >
urls . filter ( url = >
url && ! url . startsWith ( 'http://www.w3.org/' ) )
const filterDataUrls = ( urls ) = >
urls . filter ( url = >
url && ! url . startsWith ( 'data:' ) )
const html = await page . content ( ) ;
const raw = html ? . match ( LINK_REGEX ) || [ ] ;
const hrefs = await page . $ $eval (
"pierce/a[href]" ,
elems = > elems
. map ( elem = > elem . href )
. filter ( url = > url ) ,
) ;
const links = await page . $ $eval (
"pierce/link[href]" ,
elems = > elems
. map ( ( { rel , href } ) = > ( { rel , href } ) )
. filter ( ( { rel , href } ) = > rel !== 'stylesheet' )
. reduce ( ( collection , entry ) = > {
const { rel , href } = entry
const non_empty_rel = collection [ href ] ? . rel || rel
collection [ href ] = { rel : non_empty_rel , href }
return collection
} , { } )
) ;
const iframes = await page . $ $eval (
"pierce/iframe[src]" ,
elems = > elems . map ( iframe = > iframe . src ) . filter ( url = > url )
) ;
const images = await page . $ $eval (
"pierce/img[src]" ,
elems = > elems . map ( img = > img . src ) . filter ( url = > url && ! url . startsWith ( 'data:' ) )
) ;
const css_images = await page . $ $eval (
"pierce/*" ,
elems = > elems
. map ( elem = > {
const css_url_ptn = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i ;
const bg_img = window . getComputedStyle ( elem , null ) . getPropertyValue ( 'background-image' )
const bg_url = css_url_ptn . exec ( bg_img )
return bg_url ? bg_url [ 1 ] : null
} )
)
const css_stylesheets = await page . $ $eval (
"pierce/link[rel=stylesheet]" ,
elems = > elems . map ( elem = > elem . href ) . filter ( url = > url )
) ;
const js_scripts = await page . $ $eval (
"pierce/script[src]" ,
elems = > elems . map ( elem = > elem . src ) . filter ( url = > url )
) ;
const outlinks_info = {
TYPE : 'outlinks' ,
VERSION : version ,
URL : original_url ,
raw : [ . . . new Set ( filterDataUrls ( filterW3Urls ( raw ) ) ) ] ,
hrefs : [ . . . new Set ( filterDataUrls ( hrefs ) ) ] ,
links : [ . . . Object . values ( links ) ] ,
iframes : [ . . . new Set ( iframes ) ] ,
images : [ . . . new Set ( filterDataUrls ( images ) ) ] ,
css_images : [ . . . new Set ( filterDataUrls ( css_images ) ) ] ,
css_stylesheets : [ . . . new Set ( filterDataUrls ( css_stylesheets ) ) ] ,
js_scripts : [ . . . new Set ( filterDataUrls ( js_scripts ) ) ] ,
}
if (raw?.length || hrefs?.length || Object.keys(links).length || iframes?.length) {   // links is an object keyed by href, so check its key count rather than .length
console.log(`[🖇️] Saving page outgoing links (${raw?.length || hrefs?.length})...`.padEnd(82 + 1), prettyPath(OUTLINKS_PATH(page)))
await overwriteFile ( OUTLINKS_PATH ( page ) , outlinks_info )
}
return outlinks_info
}
async function saveAuthStorage ( page , { client , version , original_url } ) {
const url = original_url || await page . url ( )
if ( URL_SCHEMES_IGNORED . includes ( url . split ( ':' ) [ 0 ] ) ) return null
if ( ! SAVE_AUTH_STORAGE ) return null
// const cookies = JSON.stringify(await page.cookies()); // doesnt include httponly cookies
const auth_from_browser = {
cookies : ( await client . send ( 'Network.getAllCookies' ) ) . cookies ,
localStorage : { } ,
sessionStorage : { } ,
}
// attempt to load localStorage and sessionStorage from browser (may fail in some cases https://github.com/puppeteer/puppeteer/issues/921)
try {
auth_from_browser . localStorage = ( await page . evaluate ( ( ) = >
JSON . parse ( JSON . stringify ( { [ window . location . origin ] : window . localStorage } ) ) ) )
} catch ( err ) {
throw ` Failed to get page window.localStorage! ${ err } `
}
try {
auth_from_browser . sessionStorage = ( await page . evaluate ( ( ) = >
JSON . parse ( JSON . stringify ( { [ window . location . origin ] : window . sessionStorage } ) ) ) )
} catch ( err ) {
throw ` Failed to get page window.sessionStorage! ${ err } `
}
// WARNING: small TOCTTOU gap between this read-before-write and the write below
// can possibly overwrite changes made by other processes in this gap
const auth_on_disk = await loadAuthStorage ( page , { client } , { apply : false } )
const cookies = dedupeCookies ( [ . . . auth_on_disk . cookies , . . . auth_from_browser . cookies ] )
const auth_info = {
TYPE : 'auth' ,
VERSION : version ,
URL : original_url ,
cookies : cookies ,
sessionStorage : merge ( auth_on_disk . sessionStorage , auth_from_browser . sessionStorage ) ,
localStorage : merge ( auth_on_disk . localStorage , auth_from_browser . localStorage ) ,
}
// console.log(`[⛙] Merged ${auth_on_disk.cookies.length} existing + ${auth_from_browser.cookies.length} new -> ${auth_info.cookies.length} cookies`)
console . log ( ` [🍪] Saving cookies/localStorage/sessionStorage ( ${ auth_info . cookies . length } )... ` . padEnd ( 82 ) , prettyPath ( AUTH_JSON_PATH ) ) ;
await overwriteFile ( AUTH_JSON_PATH , auth_info ) ;
// Write to cookies.txt file using tough-cookie + @root/file-cookie-store
await saveCookiesTxt ( cookies )
return auth_info
}
async function saveCookiesTxt ( cookies ) {
const cookies_store = new FileCookieStore ( COOKIES_TXT_PATH , { auto_sync : false , lockfile : false } )
const cookie_jar = new ToughCookie . CookieJar ( cookies_store )
cookie_jar . setCookieAsync = util . promisify ( cookie_jar . setCookie )
cookies_store . saveAsync = util . promisify ( cookies_store . save )
for ( const cookie of cookies ) {
const cookie_for_tough = {
    domain: cookie.domain,
    path: cookie.path,
    key: cookie.name,
    value: cookie.value,
    expires: (new Date(cookie.expires * 1000)).toISOString(),
    hostOnly: !cookie.domain.startsWith('.'),   // a leading . means a domain-wide cookie, so host-only is the inverse
    secure: cookie.secure,
}
// console.log('COOKIE_FOR_TOUGH_TXT', cookie_for_tough)
const parsed_cookie = ToughCookie . Cookie . fromJSON ( cookie_for_tough )
// console.log('COOKIE_FOR_TOUGH_TXT_TO_DUMP', parsed_cookie)
try {
// assemble a fake URL just to satisfy ToughCookieJar's requirement of having a URL at set time
let url = cookie . secure ? 'https://' : 'http://'
if ( cookie . domain . startsWith ( '.' ) ) {
url = url + cookie . domain . slice ( 1 )
} else {
url = url + cookie . domain
}
if ( cookie . sourcePort && ! [ 80 , 443 ] . includes ( cookie . sourcePort ) ) {
url = ` ${ url } : ${ cookie . sourcePort } `
}
url = ` ${ url } ${ cookie . path || '' } `
await cookie_jar . setCookieAsync ( parsed_cookie , url , { ignoreError : true } )
} catch ( err ) {
console . error ( '[❌] Failed to dump browser cookie for cookies.txt...' , cookie_for_tough , '->' , parsed_cookie , err )
}
}
console . log ( ` [🍪] Saving cookies TXT ( ${ cookies . length } )... ` . padEnd ( 82 ) , prettyPath ( COOKIES_TXT_PATH ) ) ;
await cookies_store . saveAsync ( )
}
async function saveMetrics ( page , { original_url , version , start_time , start_ts , traffic_log , redirects } ) {
const end_time = ( new Date ( ) ) . toISOString ( )
const end_ts = Date . now ( )
const metrics_info = {
TYPE : 'metrics' ,
VERSION : version ,
URL : original_url ,
. . . ( await page . metrics ( ) ) ,
start_time ,
start_ts ,
end_time ,
end_ts ,
duration : ( end_ts - start_ts ) ,
num_requests : traffic_log.length ,
num_redirects : Object.keys ( redirects ) . length - 1 ,
}
console . log ( ` [🏎️] Saving final summary + timing metrics... ` . padEnd ( 82 + 1 ) , prettyPath ( METRICS_PATH ( page ) ) )
await overwriteFile ( METRICS_PATH ( page ) , metrics_info )
return metrics_info
}
/******************************************************************************/
/******************************************************************************/
/**************************** Utility Helpers *********************************/
function hashCode ( str ) {
// get a simple integer hash for a given string (based on java String#hashCode)
// useful only for throwaway nonces / easy deterministic random identifiers, not a replacement for sha256
let hash = 0 ;
for ( let i = 0 ; i < str.length ; i + + ) {
hash = str . charCodeAt ( i ) + ( ( hash << 5 ) - hash ) ;
}
return Math . abs ( hash )
}
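// e.g. (illustrative) hashCode('abc') === 96354   // the Java-style 31*h + c rolling hash, made positive via Math.abs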
function unique ( iter , key : string | ( ( any , number ) = > string ) = 'id' ) {
// uniqueify an array of objects by a value within them, key can be name of attr or getter function
// > iter = [{id: 1}, {id: 2}, {id: 1}]
// > Object.entries(iter) = [
// [ '0', { id: 1 } ],
// [ '1', { id: 2 } ],
// [ '2', { id: 1 } ] ]
// > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}}
// > iter = {a1: {id: 1}, b2: {id: 2}, a3: {id: 1}}
// > Object.entries(iter) = [
// [ 'a1', { id: 1 } ],
// [ 'b2', { id: 2 } ],
// [ 'a3', { id: 1 } ]
// ]
// > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}}
const key_type = ( typeof key )
if ( ! [ 'function' , 'string' ] . includes ( key_type ) )
throw 'key must be either a string lookup key or a function (obj, idx) => return unique_id'
const key_func = ( key_type === 'string' )
? ( entry_obj , idx ) = > entry_obj [ ( key as string ) ]
: ( entry_obj , idx ) = > ( key as Function ) ( entry_obj , idx ) // otherwise key is a callback func
const seen = { }
for ( const [ idx , entry_obj ] of Object . entries ( iter ) ) {
const unique_id = key_func ( entry_obj , idx )
if ( seen [ unique_id ] === undefined ) {
seen [ unique_id ] = entry_obj
}
}
return seen
}
const wait = (ms: number) => new Promise(res => {
    if (ms > 10_000) {
        console.debug(`[⏲️] Waiting ${Math.round(ms / 1000)}s...`)
    }
    setTimeout(res, ms)
})
const TimeoutError = Symbol ( )
const withTimeout = ( promise , ms ) = > {
// run a promise with a time limit, raises a TimeoutError if it fails
let timer
return Promise . race ( [
promise ,
new Promise ( ( _r , reject ) = >
timer = setTimeout ( reject , ms , TimeoutError )
) ,
] ) . finally ( ( ) = > clearTimeout ( timer ) )
}
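// example usage (sketch): race a slow await against a time limit and detect the TimeoutError sentinel
// try {
//     const html = await withTimeout(page.content(), 5_000)
// } catch (err) {
//     if (err === TimeoutError) console.warn('[⏲️] page.content() did not finish within 5s')
//     else throw err
// }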
const MAX_VALID_DATE = new Date ( '2150-01-01T00:00:00.000Z' )
const MIN_VALID_DATE = new Date ( '2010-01-01T00:00:00.000Z' )
const UNIX_EPOCH_DATE = new Date ( 0 )
const validateDate = ( date , { min = MIN_VALID_DATE , max = MAX_VALID_DATE , singleton = UNIX_EPOCH_DATE } = { } ) = > {
assert ( ( date instanceof Date ) , ` Got invalid type for Date: ${ typeof date } ${ date } (expected Date) ` )
assert ( String ( date ) !== 'Invalid Date' , ` Got invalid value for Date: ${ typeof date } ${ date } ` )
if ( Number ( date ) === Number ( singleton ) ) return date // epoch singleton is always valid
assert ( date < max , ` Got Date that was higher than MAX_VALID_DATE= ${ max } ` )
assert ( date > min , ` Got Date that was lower than MIN_VALID_DATE= ${ min } ` )
return date
}
const parseVersionDateStr = ( yyyymmddtime ) = > {
// YYYYMMDDhhmmssxxx or YYYYMMDDhhmmss or YYYYMMDDhhmm or YYYYMMDD -> Date
const is_only_numbers = /^\d+$/ . test ( yyyymmddtime . replace ( '.' , '' ) )
assert ( is_only_numbers , ` Non-numeric characters in YYYYMMDD date are not allowed: ${ yyyymmddtime } (while trying YYYYMMDDhhmmssxxx format) ` )
const num_digits = String ( yyyymmddtime ) . split ( '.' ) [ 0 ] . length
assert ( [ 17 , 14 , 12 , 8 ] . includes ( num_digits ) , ` Got invalid number of digits ( ${ num_digits } ) in YYYYMMDD date: ${ yyyymmddtime } (while trying YYYYMMDDhhmmssxxx format) ` )
const [ _all , yyyy , mm , dd , hr , min , sec , ms ] = /^(\d{4})(\d{2})(\d{2})(\d{2})?(\d{2})?(\d{2})?(\d{3})?$/ . exec ( yyyymmddtime )
assert ( yyyy && mm && dd , ` Could not find YYYYMMDD ` )
const time_error_msg = `Detected YYYYMMDD[hhmm[ss[xxx]]] but time segment is invalid ${hr || '__'}:${min || '__'}:${sec || '__'}.${ms || '___'}`
if ( ms ) assert ( hr && min && sec , time_error_msg )
if ( sec ) assert ( hr && min , time_error_msg )
if ( min ) assert ( hr , time_error_msg )
if ( hr ) assert ( min , time_error_msg )
const iso_str = `${yyyy}-${mm}-${dd}T${hr || '00'}:${min || '00'}:${sec || '00'}.${ms || '000'}Z`
const parsed_date = new Date ( iso_str )
return validateDate ( parsed_date ) // 1970-01-01T00:00:00.000Z (ISO format)
}
const parseTimestampDateStr = ( timestamp ) = > {
// 1709724291000 or 1709724291000.000 or 1709724291 or 1709724291.000 -> Date
timestamp = String ( timestamp )
const is_only_numbers = /^\d+$/ . test ( timestamp . replace ( '.' , '' ) )
assert(is_only_numbers, `Got invalid characters in timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`)
const num_digits = String ( timestamp ) . split ( '.' ) [ 0 ] . length
assert ( [ 13 , 10 , 1 ] . includes ( num_digits ) , ` Got invalid number of digits ( ${ num_digits } ) in timestamp: ${ timestamp } (while trying xxxxxxxxxxxxx format) ` )
let parsed_date = null
if ( num_digits === 13 ) {
parsed_date = new Date ( Number ( timestamp ) ) // 1709724291000 (unix timestamp w/ milliseconds)
} else if ( num_digits === 10 ) {
parsed_date = new Date ( Number ( timestamp ) * 1000 ) // 1709724291 (unix timestamp w/ seconds)
} else if ( num_digits === 1 ) {
assert ( String ( timestamp ) === '0' , ` Got invalid single-digit timestamp: ${ timestamp } (while trying xxxxxxxxxxxxx format or 0 for UNIX epoch) ` )
parsed_date = UNIX_EPOCH_DATE
}
return validateDate ( parsed_date )
}
const parseISODateStr = ( iso_str ) = > {
// 1970-01-01T00:00:00.000Z -> Date
const num_digits = String ( iso_str ) . length
assert ( [ 24 , 19 , 16 , 10 ] . includes ( num_digits ) , ` Got invalid number of digits ( ${ num_digits } ) in ISO date: ${ iso_str } (while trying 1970-01-01T00:00:00.000Z format) ` )
const parsed_date = new Date ( iso_str )
return validateDate ( parsed_date )
}
const parseDate = ( date ) = > {
// date === undefined => use today/now
// date === null => use unix epoch 0 aka 1970-01-01T00:00:00.000Z
// date *= YYYYMMDDHHMMSS => use a version date string (e.g. 20010131235958)
// date *= 1234567... => use a timestamp (e.g. 1709724291000)
// date *= 1970-01-01T... => use iso datetime (e.g. 1970-01-01T00:00:00.000Z)
// returns -> Date
if ( date === undefined ) {
return ( new Date ( ) ) // today (2024-05-29T22:02:34.682Z) aka timestamp=1717020154682
}
if ( date === null || date == 0 ) {
return UNIX_EPOCH_DATE // unix epoch (1970-01-01T00:00:00.000Z) aka timestamp=0
}
if ( date instanceof Date ) {
return validateDate ( date ) // JS date Date('1970-01-01T00:00:00.000Z')
}
if ( ( typeof date ) === 'number' ) {
date = String ( date ) // unix timestamp e.g. 1717020154682
}
assert ( ( typeof date ) === 'string' , ` Tried to parse date but got unsupported type ${ ( typeof date ) } : ${ date } ` )
const errors = [ ` Failed to parse Date from string: ${ date } ` ]
try {
return parseVersionDateStr ( date )
} catch ( err ) { errors . push ( err ) }
try {
return parseTimestampDateStr ( date )
} catch ( err ) { errors . push ( err ) }
try {
return parseISODateStr ( date )
} catch ( err ) { errors . push ( err ) }
throw errors . join ( '\n' )
}
const versionStrFromDate = ( date , { withDate = true , withTime = true , withSeconds = true , withMilliseconds = false } = { } ) = > {
// takes Date, returns YYYYMMDDHHMMSSXXX or YYYYMMDDHHMMSS or YYYYMMDDHHMM or YYYYMMDD
const parsed_date = parseDate ( date )
const [ date_iso , time_iso ] = parsed_date . toISOString ( ) . split ( 'T' ) // ['2001-01-31', '23:59:58.090Z']
const components_to_use = [ ]
if ( withDate ) {
components_to_use . push ( date_iso . replaceAll ( '-' , '' ) ) // '20010131'
}
if ( withTime ) {
const [ hr , min , sec , ms ] = time_iso . replace ( 'Z' , '' ) . replace ( '.' , ':' ) . split ( ':' ) // ['23', '59', '58', '090']
components_to_use . push ( hr )
components_to_use . push ( min )
if ( withSeconds ) {
components_to_use . push ( sec )
if ( withMilliseconds ) {
components_to_use . push ( ms )
}
}
}
assert ( components_to_use . length , 'At least one of {withDate, withTime} must be set.' )
const final_str = components_to_use . join ( '' ) // 20010131235958
assert ( parseVersionDateStr ( final_str ) ) // sanity check to make sure it parses correctly
return final_str
}
// test date functions:
// console.log(parseDate('20120131'))
// console.log(versionStrFromDate(parseDate('20120131')))
// console.log(versionStrFromDate(parseDate('0')))
// console.log(versionStrFromDate(parseDate(0)))
// console.log(versionStrFromDate(parseDate(null)))
// console.log(versionStrFromDate())
// console.log(versionStrFromDate(parseDate('20120131235859090')))
// console.log(versionStrFromDate(parseDate('1970-01-01T00:00:00.000Z')))
// console.log(versionStrFromDate(parseDate('2024-12-01T00:00')))
// console.log(versionStrFromDate(parseDate('2024-12-01'), {withTime: false}))
const prettyPath = ( path ) = > {
// return a pretty-printable path where the abspath of the data dir is replaced with /data for brevity/privacy
return path . replace ( DATA_DIR , './data' )
}
const pathIsHidden = ( relpath ) = > {
// check if a path or any of the directories above it are hidden (e.g. ./some/.dir/abc or ./.DS_Store)
// make sure test path behaves like an abspath (avoids edge-cases messing up relpaths on '' or '.' or './')
let test_path = relpath
if ( test_path . startsWith ( './' ) )
test_path = test_path . substring ( 2 )
if ( ! test_path . startsWith ( '/' ) )
test_path = path . join ( '/' , test_path )
// iterate through parents, checking if any parent is hidden until we reach /
while ( test_path !== '/' ) {
const basename = path . basename ( test_path )
if ( basename . startsWith ( '.' ) ) {
// console.log('PATH IS HIDDEN', relpath)
return true
}
// otherwise set test_path to parent dir and repeat
test_path = path . dirname ( test_path )
}
return false
}
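// e.g. (illustrative) pathIsHidden('./some/.dir/abc') === true
//      pathIsHidden('archive/1709724410.19269/screenshot.png') === false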
const pathDepth = ( child_path , relative_to = '.' ) = > {
// get the number of directory hops deep a child path is relative to '.' (or a given parent)
if ( child_path . startsWith ( '/' ) && ! relative_to . startsWith ( '/' ) ) {
// if child_path is absolute, then relative_to must be absolute as well otherwise depth will be depth all the way to the / root
relative_to = fs . realpathSync ( relative_to )
}
if ( relative_to . startsWith ( '/' ) && ! child_path . startsWith ( '/' ) ) {
// same deal, either both paths have to be relative, or both have to be absolute
child_path = fs . realpathSync ( child_path )
}
const relative_path_to_root = path . relative ( relative_to , child_path )
const num_hops_down = relative_path_to_root . split ( '/' ) . length
return num_hops_down
}
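// e.g. (illustrative) pathDepth('archive/1709724410.19269/seo.json') === 3   // three hops below '.'
//      pathDepth('/data/archive/1709724410.19269', '/data') === 2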
interface DirentWithExtras extends fs . Dirent {
relpath : string ,
abspath : string ,
reldepth : number ,
}
async function getDirEntries ( dir_path , { pwd = null , recursive = true , includeHidden = false , includeFiles = true , includeDirs = true , includeLinks = false , filter = null , maxdepth = - 1 } = { } ) {
// get the list of all sub-paths under a given path recursively
// console.log('GETTING DIRECTORY ENTRIES', {dir_path, pwd, recursive, includeHidden, includeFiles, includeDirs, maxdepth})
pwd = pwd || dir_path
let dir_abspath = dir_path
if ( ! dir_abspath . startsWith ( pwd ) ) {
dir_abspath = path . join ( pwd , dir_abspath )
}
assert ( fs . existsSync ( dir_abspath ) , ` Tried to get directory listing for dir that doesn't exist! ${ prettyPath ( dir_abspath ) } ` )
return ( await fs . promises . readdir ( dir_abspath , { recursive , withFileTypes : true } ) )
. map ( ( dirent : DirentWithExtras ) = > {
// filter combined with map because relpath is re-used in both operations
const relpath = path . join ( path . relative ( pwd , dirent . parentPath ) , dirent . name )
// console.log('CALCULATED RELATIVE PATH', relpath)
const abspath = path . join ( dir_abspath , relpath )
const basename = path . basename ( dirent . name )
if ( ! includeLinks && dirent . isSymbolicLink ( ) ) return null
if ( ! includeFiles && dirent . isFile ( ) ) return null
if ( ! includeDirs && dirent . isDirectory ( ) ) return null
if ( ! includeHidden && pathIsHidden ( relpath ) ) return null
dirent . relpath = relpath
dirent . abspath = abspath
dirent . reldepth = pathDepth ( relpath )
// console.log('RELATIVE DEPTH MEASURED', prettyPath(dir_abspath), prettyPath(relpath), dirent.reldepth)
if ( maxdepth >= 0 ) {
if ( ( dirent . reldepth - 1 ) > maxdepth ) return null
}
if ( ( typeof filter ) === 'function' ) {
const should_keep = filter ( { abspath , relpath , basename , dirent } )
if ( ! should_keep ) {
// console.log('FILTER EXCLUDED RESULT', {abspath, relpath, basename, dirent})
return null
}
}
return relpath
} )
. filter ( Boolean )
. sort ( ) as string [ ]
}
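// example usage (sketch, the snapshot path is illustrative): list all non-hidden *.json files up to 2 dirs deep
// const json_relpaths = await getDirEntries('./archive/1709724410.19269', {
//     includeDirs: false,
//     maxdepth: 2,
//     filter: ({basename}) => basename.endsWith('.json'),
// })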
async function getTotalSize ( dir_or_file_path , { pwd = null , _cache = null , filter = null , subfiles = null } = { } ) {
// get the total size in bytes of a file or directory (recursively adds up file sizes within directory)
// check _cache first
if ( _cache && ( dir_or_file_path in _cache ) )
return _cache [ dir_or_file_path ]
// make sure dir_or_file_path is under pwd
pwd = pwd || path . dirname ( dir_or_file_path )
let abspath = dir_or_file_path
if ( ! dir_or_file_path . startsWith ( pwd ) ) {
abspath = path . join ( pwd , dir_or_file_path )
}
// if it's a file, stat it and return the size
// console.log('CALCULATED ABSPATH', {abspath, dir_or_file_path, pwd})
const dirent = await fs . promises . stat ( abspath )
if ( dirent . isFile ( ) ) {
// console.log('CALCULATING FILE SIZE subfile=', prettyPath(abspath))
return dirent . size
}
// if it's not a file and not a directory, give up, dont try to size special files like FIFO/socket/etc.
if ( ! dirent . isDirectory ( ) ) return 0
// if it's a directory, size is the sum of all the sizes of files within
// console.log('CALCULATING SUBDIR SIZE subdir=', prettyPath(abspath))
let total_bytes = 0
const files_within = subfiles || await getDirEntries ( dir_or_file_path , {
pwd ,
recursive : true ,
includeDirs : false ,
includeFiles : true ,
filter ,
} )
for ( const subpath of files_within ) {
total_bytes += await getTotalSize ( subpath , { pwd , _cache , filter } )
}
return total_bytes
}
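// e.g. (illustrative) total bytes of one file vs. a whole snapshot dir (recursive sum of the files within):
// const favicon_bytes = await getTotalSize('./archive/1709724410.19269/favicon.ico')
// const snapshot_bytes = await getTotalSize('./archive/1709724410.19269')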
async function getDirSizes ( dir_path , { pwd = null , subfiles = null , withRoot = true , filter = null , maxdepth = - 1 } = { } ) {
// get the size of a directory and all the files within (recursively) as a number of bytes
// dir_path: path absolute or relative path of the directory you want size info for
// pwd: path (optional) absolute path of the directory you want to interpret dir_path relative to
// subfiles: dirent[] (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
// withRoot: bool include a summary entry for the root dir_path dir in the list as '.'
// filter: function (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
// maxdepth: number (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
assert ( ( await fs . promises . stat ( dir_path ) ) . isDirectory ( ) , ` Tried to calculate directory sizes but path is not a directory! ${ dir_path } ` )
pwd = pwd || dir_path
// {'.': 246, 'example.json': 123, 'example2.txt': 123}
const sizes = { }
// first collect the list of all sub-files recursively and calculate their sizes individually
const files_within = subfiles || await getDirEntries ( dir_path , {
pwd ,
recursive : true ,
includeDirs : false ,
includeFiles : true ,
// dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir sizes
// it never makes sense to ignore subfiles beyond a certain depth for size calculation
filter, // filter is allowed though, useful to calculate the size of some subset of files that match a pattern
} )
for ( const subpath of files_within ) {
sizes [ subpath ] = await getTotalSize ( subpath , { pwd , _cache : sizes , filter } )
}
// then calculate the top-level directory total as the sum of all the file sizes under it
const total_size = Object . values ( sizes ) . reduce ( ( a : number , b : number ) = > a + b , 0 )
// then calculate the subtotals of all the sub-directories
const subdirs_within = await getDirEntries ( dir_path , { pwd , recursive : true , includeDirs : true , includeFiles : false , filter , maxdepth } )
for ( const subpath of subdirs_within ) {
sizes [ subpath ] = await getTotalSize ( subpath , { pwd , _cache : sizes , filter } ) // uses _cache to avoid re-computing
}
// if maxdepth is passed, filter results to only include paths shallower than max depth
if ( maxdepth >= 0 ) {
for ( const subpath of Object . keys ( sizes ) ) {
if ( pathDepth ( subpath ) > maxdepth ) {
delete sizes [ subpath ]
}
}
}
// set total_size last so it appears at the bottom of the object in logs for convenience
if ( withRoot ) {
sizes [ '.' ] = total_size
}
return sizes
}
async function getLargestPath ( path_a , path_b ) {
// compare two files/directories and return the largest one of the two (calculating size recursively)
path_a = await fs . promises . realpath ( path_a )
path_b = await fs . promises . realpath ( path_b )
const size_a = await getTotalSize ( path_a )
const size_b = await getTotalSize ( path_b )
// console.log('COMPARING', prettyPath(path_a), size_a, ' ', prettyPath(path_b), size_b)
if ( size_a > size_b ) return path_a
return path_b
}
async function findCommonAncestor ( target_abspath , symlink_abspath , { relative = true , search_limit = DATA_DIR } : { relative? : boolean | string , search_limit? : string } = { } ) {
// given a target path and a symlink path, find the common ancestor path they both share
// (searches recursively through absolute path parent directories until a common dir is found, up to search_limit)
search_limit = await fs . promises . realpath ( search_limit )
let relative_dir = search_limit
if ( ( typeof relative ) === 'boolean' ) {
// if start dir is default, set it to symlinks directory path
if ( relative ) {
relative_dir = path . dirname ( symlink_abspath )
} else {
relative_dir = search_limit
}
} else if ( ( typeof relative ) === 'string' ) {
// if start dir is a string, get its absolute path
relative_dir = relative as string
} else {
throw ` Got invalid type for relative path during common ancestor search: ${ relative } `
}
if ( ( await fs . promises . stat ( relative_dir ) ) . isFile ( ) ) {
// if start dir is a file, set it to its parent dir path
relative_dir = path . dirname ( relative_dir )
}
assert (
( await fs . promises . stat ( relative_dir ) ) . isDirectory ( ) ,
` Tried to find common ancestor starting from invalid search directory: \ n 🔗 ${ prettyPath ( symlink_abspath ) } \ n -> ${ prettyPath ( target_abspath ) } \ n Error: search dir does not exist or is not a directory: ❌ ${ prettyPath ( relative_dir ) } ` ,
)
const symlink_filename = path . basename ( symlink_abspath )
const target_filename = path . basename ( target_abspath )
const symlink_parent_abspath = await fs . promises . realpath ( path . dirname ( symlink_abspath ) )
const target_parent_abspath = await fs . promises . realpath ( path . dirname ( target_abspath ) )
const search_dir_abspath = await fs . promises . realpath ( relative_dir )
let closest_common_ancestor = search_dir_abspath
const isAncestorCommon = ( ancestor ) = > (
target_parent_abspath . startsWith ( ancestor )
&& symlink_parent_abspath . startsWith ( ancestor ) )
// check if both src and target start with the same ancestor path
while ( closest_common_ancestor !== search_limit ) {
if ( isAncestorCommon ( closest_common_ancestor ) ) break
else {
// otherwise go up one directory and try again
// console.log(' ...going up a directory', prettyPath(closest_common_ancestor)+'/..')
closest_common_ancestor = path . dirname ( closest_common_ancestor )
}
}
assert (
isAncestorCommon ( closest_common_ancestor ) ,
` Tried to create relative symlink but could not find common ancestor: \ n 🔗 ${ prettyPath ( symlink_abspath ) } \ n -> ${ prettyPath ( target_abspath ) } \ n Error: target path and symlink path are not both under: \ n ❌ ${ prettyPath ( closest_common_ancestor ) } ` ,
)
const symlink_to_ancestor_relpath = path . relative ( symlink_parent_abspath , closest_common_ancestor ) // ../../..
const target_from_ancestor_relpath = path . join ( path . relative ( closest_common_ancestor , target_parent_abspath ) , target_filename ) // 'archive/19999999.23423523'
const symlink_to_target_relpath = path . join ( symlink_to_ancestor_relpath , target_from_ancestor_relpath ) // '../../../archive/19999999.23423523'
return {
closest_common_ancestor ,
search_dir_abspath ,
target_abspath ,
target_filename ,
target_from_ancestor_relpath ,
symlink_abspath ,
symlink_filename ,
symlink_to_ancestor_relpath ,
symlink_to_target_relpath ,
}
}
interface StatsWithExtras extends fs . Stats {
abspath : string
relpath? : string
reldepth? : number
}
async function blockUntilExists(file_path, {timeout = 7_500, min_bytes = 0} = {}) {
// wait up to timeout seconds until file we expect to exist appears on the filesystem
// (used to handle eventual consistency in network filesystems where we need a delay after writing before reads show up)
const interval = 250
const max_tries = timeout / interval
let tries = 0
let abspath = null
while ( tries < max_tries ) {
try {
const abspath = await fs . promises . realpath ( file_path )
assert ( fs . existsSync ( abspath ) )
const dirent = await fs . promises . stat ( abspath ) as StatsWithExtras
dirent . abspath = abspath
if ( min_bytes && ( dirent . size < min_bytes ) ) {
assert ( dirent . size >= 1 )
// this is a valid warning but unfortunately its too common to bother showing:
// console.warn(`[⚠️] Expected file to be >=${Math.round(min_bytes/1000)}kb but was only ${dirent.size/1000}kb:`, prettyPath(file_path))
}
return dirent
} catch ( err ) {
const waited = ( tries * interval )
if (waited === 5_000) {
console . warn ( ` [⚠️] Waited > ${ waited / 1000 } s for file to appear (is filesystem or bg task running slow?): ` , prettyPath ( file_path ) )
}
await wait ( interval )
tries ++
}
}
throw ` Expected file does not exist after ${ timeout / 1000 } s: ${ prettyPath ( file_path ) } `
}
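// example usage (sketch, path is illustrative): wait for a file written by a background task to appear before reading it
// const dirent = await blockUntilExists('./archive/1709724410.19269/screenshot.png', {timeout: 10_000, min_bytes: 1_000})
// console.log('file ready:', prettyPath(dirent.abspath), dirent.size, 'bytes')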
async function overwriteSymlink(target_path, symlink_path, {relative = true, mkdirs = false, search_limit = DATA_DIR, timeout = 5_000}: {relative?: boolean | string, mkdirs?: boolean, search_limit?: string, timeout?: number} = {}) {
// create a symlink from symlink_path -> target_path
// relative: true => symlink is created as a relative link by default (it will auto-find the closest common ancestor dir, often DATA_DIR)
// mkdirs: true => optionally creates symlink parent dirs automatically)
// make sure target file actually exists first
let target_dirent
try {
target_dirent = await blockUntilExists ( target_path , { timeout } )
} catch ( err ) {
throw ` Tried to create symlink pointing to file that does not exist: \ n 🔗 ${ prettyPath ( symlink_path ) } \ n -> ❌ ${ prettyPath ( target_path ) } \ n ${ err } `
}
const target_abspath = target_dirent . abspath
const target_filename = path . basename ( target_abspath )
const target_parent_abspath = path . dirname ( target_abspath )
// make sure target is a valid file or directory and not a special character/block device/other weird file
const target_is_dir = target_dirent . isDirectory ( )
const target_is_file = target_dirent . isFile ( )
assert ( target_is_dir || target_is_file , ` Tried to create symlink to an unsupported file type: \ n 🔗 ${ prettyPath ( symlink_path ) } \ n -> ❌ ${ prettyPath ( target_path ) } (expected file or directory) ` )
// create symlink file parent directories if needed
const symlink_filename = path . basename ( symlink_path )
const symlink_parent_dir = path . dirname ( symlink_path )
if ( mkdirs ) {
await fs . promises . mkdir ( symlink_parent_dir , { recursive : true } )
}
try {
assert ( ( await fs . promises . stat ( symlink_parent_dir ) ) . isDirectory ( ) )
} catch ( err ) {
throw ` Tried to create symlink in a directory that doesn't exist: \ n 🔗 ${ symlink_parent_dir } ❌/ ${ symlink_filename } \ n -> ${ target_path } \ n ${ err } `
}
const symlink_parent_abspath = await fs . promises . realpath ( symlink_parent_dir )
const symlink_abspath = path . join ( symlink_parent_abspath , symlink_filename )
// determine nearest common ancestor between symlink dir and target dir
const {
closest_common_ancestor ,
symlink_to_ancestor_relpath ,
target_from_ancestor_relpath ,
symlink_to_target_relpath ,
} = await findCommonAncestor ( target_abspath , symlink_abspath , { relative , search_limit } )
// set final target path to abspath or relative path depending on {relative} options
let target_path_final
if ( relative ) {
// make symlink into relative link (based on closest common ancestor dir between symlink_abspath and target_abspath)
target_path_final = symlink_to_target_relpath
// console.log(' 🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), `(as relative link: ${target_path_final})`)
} else {
// make symlink into an absolute path (verbatim passed target_path)
target_path_final = target_path
// console.log(' 🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), '(as absolute path)')
}
// remove any existing symlink at destination if there is already one there
const random_nonce = crypto . randomBytes ( 16 ) . toString ( 'hex' ) . substring ( 0 , 8 )
const symlink_temp_path = ` ${ symlink_abspath } . ${ random_nonce } .dup `
try { await fs . promises . unlink ( symlink_abspath ) } catch ( err ) { }
try { await fs . promises . unlink ( symlink_temp_path ) } catch ( err ) { }
// create the symlink and check that it works after creation
let created_symlink = null
try {
created_symlink = symlink_temp_path
await fs . promises . symlink ( target_path_final , symlink_temp_path )
created_symlink = symlink_abspath
await fs . promises . rename ( symlink_temp_path , symlink_abspath )
} catch ( err ) {
if ( String ( err ) . includes ( 'EISDIR' ) ) {
// console.warn('[⚠️] Tried to create symlink on top of existing directory', prettyPath(symlink_abspath))
// no real recourse in this situation, and its too noisy to log every time this happens
// it's also not always safe to move the dir out of the way, so better to just fail silently here, leaving:
// ${symlink_abspath}.${random_nonce}.dup
} else {
console . warn ( '[⚠️] Failed to create symlink' , prettyPath ( created_symlink ) , err )
}
}
let dirent
try {
dirent = await blockUntilExists ( created_symlink , { timeout , min_bytes : 0 } )
// best we can do here is just check that it exists ^, trying to check that it has the exact expected abspath that we set is bad, because its a race condition:
// assert(dirent.abspath == target_abspath) // its often already overwritten by later activity, so final abspath may already be different
} catch ( err ) {
throw ` Symlink created but does not seem to resolve to intended file: \ n 🔗 ${ symlink_path } \ n -> ❌ ${ target_path } \ n actual= ${ dirent ? . abspath } \ n expected= ${ target_abspath } \ n ${ err } `
}
return {
symlink_path ,
symlink_abspath : created_symlink ,
symlink_filename : path.basename ( created_symlink ) ,
symlink_parent_abspath ,
symlink_to_ancestor_relpath ,
symlink_to_target_relpath ,
target_path ,
target_abspath ,
target_filename ,
target_parent_abspath ,
target_from_ancestor_relpath ,
target_path_final ,
target_is_dir ,
target_is_file ,
target_is_relative : Boolean ( relative ) ,
closest_common_ancestor ,
}
}
// test symlink and common ancestor finding
// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo.json', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo2.json'))
// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', {relative: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269'))
// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/favorite_snapshots/1709724410.19269', {relative: false, mkdirs: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
async function overwriteDir ( path ) {
// delete any existing folder at the destination path (important otherwise we may create a folder inside an existing folder/symlink)
try {
await fs . promises . rm ( path , { recursive : true , force : true } ) ;
} catch ( err ) { }
await fs . promises . mkdir ( path , { recursive : true } )
return path
}
async function overwriteFile(path, contents, options = {encoding: 'utf8', flag: 'w', flush: false, block: true}) {
// write any JS value to a fresh file (e.g. String, Buffer, Readable stream, or anything JSON-serializable)
const block_until_created = options.block !== false   // (options.block || true) would always be true, so check explicitly for false
delete options.block
try {
// delete any existing symlink/file present at the destination path
// (important otherwise we may write into an existing symlink by accident)
await fs . promises . unlink ( path )
} catch ( err ) { }
try {
let nonce = 1
while ( ( await fs . promises . stat ( path ) ) . isDirectory ( ) ) {
// if we try to write a file to a path that already has a directory in that location
// (common when trying to write response JSON e.g. http://www.instagram.com/api/graphql returns json and www.instagram.com/api/graphql/abc returns json)
path = path . replace ( ` . ${ nonce - 1 } ` , '' ) + ` . ${ nonce } `
nonce ++ ;
if ( nonce > 20 ) throw ` Too many conflicting files while trying to write to ${ prettyPath ( path ) } `
}
} catch ( err ) {
if ( ! String ( err ) . includes ( 'no such file or directory' ) ) {
console . warn ( '[⚠️] Warning: Problem with conflicting directory at while trying to write file' , err )
}
}
// refuse writing undefined/null/function because its likely an error and not intended
const content_is_null = ( contents === null ) || ( contents === undefined )
const content_is_func = ( typeof contents === 'function' )
if ( content_is_null || content_is_func ) {
throw ` Cannot write ${ typeof contents } ${ contents } to file: ${ path } `
}
// Numbers, BigInts, and Booleans can be cast to strings, then written to the file
const content_is_primitive = [ 'number' , 'bigint' , 'boolean' ] . includes ( typeof contents )
if ( content_is_primitive ) {
contents = String ( contents )
await fs . promises . writeFile ( path , contents , options as any )
if ( block_until_created ) await blockUntilExists ( path , { min_bytes : Buffer.byteLength ( contents ) } )
return path
}
// Strings and Buffers can be written directly to file
const content_is_string = ( typeof contents === 'string' || contents instanceof String )
const content_is_buffer = Buffer . isBuffer ( contents )
if ( content_is_string || content_is_buffer ) {
await fs . promises . writeFile ( path , contents , options as any )
if ( block_until_created ) await blockUntilExists ( path , { min_bytes : Buffer.byteLength ( contents ) } )
return path
}
// Readable streams (anything with a .pipe method) can be piped into the file
const content_is_stream = (contents?.pipe)
if (content_is_stream) {
const stream_byte_length = contents.readableLength || 0   // bytes currently buffered, best available size hint
const dest_file = fs . createWriteStream ( path ) ;
await finished ( contents . pipe ( dest_file ) )
if ( block_until_created ) await blockUntilExists ( path , { min_bytes : stream_byte_length } )
return path
}
// Objects and Arrays can be JSON-stringified then written into file
const content_is_obj = ( Array . isArray ( contents ) || typeof contents === 'object' )
if ( content_is_obj ) {
contents = JSON . stringify ( contents , null , 4 )
await fs . promises . writeFile ( path , contents , options as any )
if ( block_until_created ) await blockUntilExists ( path , { min_bytes : Buffer.byteLength ( contents ) } )
return path
}
throw ` Cannot write contents of type ${ typeof contents } to file: ${ path } < ${ contents } `
}
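// example usage (sketch, paths are illustrative): overwriteFile accepts strings, Buffers, Readable streams, and JSON-serializable objects
// await overwriteFile('./data/example.json', {TYPE: 'example', VERSION: '1'})        // objects/arrays are JSON.stringify'd
// await overwriteFile('./data/example.txt', 'hello world')                           // strings/Buffers are written as-is
// await overwriteFile('./data/example.bin', fs.createReadStream('./source.bin'))     // readable streams are piped to the file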
async function saveExecResult(bin, args = null, {original_url, version}, {cwd = '.', timeout = 300_000, ...spawn_options} = {}) {
assert ( bin )
assert ( original_url && original_url . includes ( '://' ) )
assert ( version )
const BIN_NAME = bin // 'yt-dlp'
const ARGS = args || [] // ['--some-arg', '--some-other-arg']
const CWD = cwd || process.cwd() // '.'
const TIMEOUT = timeout || 300_000 // honor the caller-provided timeout (defaults to 5min)
const PATH = process.env.PATH
await fs . promises . mkdir ( cwd , { recursive : true } )
// quick-n-dirty dump of cmd to bash script, but this might be better: https://github.com/nodejs/node/issues/34840#issuecomment-677402567
const cmd_log_str = ` #!/usr/bin/env bash
TYPE = "${BIN_NAME}"
URL = "${original_url}"
VERSION = "${version}"
TIMEOUT = $ { TIMEOUT }
CWD = "${CWD}"
PATH = "${PATH}:$PATH"
$ { BIN_NAME } $ { ARGS . map ( arg = > JSON . stringify ( arg ) ) . join ( ' ' ) }
`
const cmd_log = path . join ( cwd , 'cmd.sh' )
await overwriteFile ( cmd_log , cmd_log_str )
const stdout_log = fs . createWriteStream ( path . join ( cwd , 'stdout.log' ) )
const stderr_log = fs . createWriteStream ( path . join ( cwd , 'stderr.log' ) )
const start_date = new Date ( )
const start_ts = Number ( start_date )
const start_time = start_date . toISOString ( )
const child = child_process . spawn (
BIN_NAME ,
ARGS ,
{
cwd : CWD ,
timeout : TIMEOUT , // 5min timeout
stdio : [ null , 'pipe' , 'pipe' ] , // </dev/null >./stdout.log 2>./stderr.log
// detached: true, // run in background, don't block on response
. . . ( spawn_options || { } ) ,
} ,
)
child . stdout . setEncoding ( 'utf8' )
child . stdout . pipe ( stdout_log )
child . stderr . setEncoding ( 'utf8' )
child . stderr . pipe ( stderr_log )
const exec_info = {
TYPE : BIN_NAME ,
URL : original_url ,
VERSION : version ,
bin_name : BIN_NAME ,
args : ARGS ,
timeout : TIMEOUT ,
hostname : os.hostname ( ) ,
bin_paths : PATH ,
ppid : process.pid ,
pid : child.pid ,
start_ts ,
start_time ,
end_time : null ,
end_ts : null ,
duration : null ,
returncode : null ,
log_files : { } ,
output_files : { } ,
}
// promise that resolves when the command is finished executing
// TODO: refactor to use withTimeout
const getResult = ( timeout = TIMEOUT ) = >
new Promise ( ( resolve , reject ) = > {
const loop = setInterval ( ( ) = > {
if ( exec_info . end_time ) {
clearInterval ( loop )
clearTimeout ( timer )
resolve ( exec_info )
}
} , 100 )
const timer = setTimeout ( ( ) = > {
clearInterval ( loop )
if ( ! exec_info . end_time ) {
reject ( new Error ( ` Process ${ BIN_NAME } did not finish within TIMEOUT= ${ TIMEOUT } ` ) )
}
} , timeout ) ;
} )
const logFilesFilter = ( { relpath } ) = >
[ 'cmd.sh' , 'stdout.log' , 'stderr.log' ] . includes ( relpath )
const outputFilesFilter = ( { relpath } ) = >
! [ 'cmd.sh' , 'stdout.log' , 'stderr.log' , 'index.json' ] . includes ( relpath )
const getOutputFiles = async ( filter = outputFilesFilter ) = > {
return await getDirInfo ( CWD , { filter , withHelpers : false , withRoot : false , maxdepth : 6 } )
}
child . on ( 'close' , async ( returncode ) = > {
const end_date = new Date ( )
exec_info . returncode = returncode
exec_info . pid = child . pid
exec_info . end_ts = Number ( end_date )
exec_info . end_time = end_date . toISOString ( )
exec_info . duration = exec_info . end_ts - exec_info . start_ts
exec_info . log_files = await getOutputFiles ( logFilesFilter )
exec_info . output_files = await getOutputFiles ( outputFilesFilter )
const end_metadata = `
# END_TIME = "${exec_info.end_time}"
# DURATION = $ { exec_info . duration }
# RETURNCODE = $ { exec_info . returncode }
`
await fs . promises . appendFile ( cmd_log , end_metadata )
// write exec_info json (which includes file list) to CWD/index.json
await overwriteFile ( path . join ( CWD , 'index.json' ) , exec_info )
} )
// child.unref() // dont wait for child process to close
const start_metadata = `
# # # # # # # # # # # # # # # # # # # # LAST RUN LOG # # # # # # # # # # # # # # # # # # # #
# HOSTNAME = "${exec_info.hostname}"
# PPID = $ { exec_info . ppid }
# PID = $ { exec_info . pid }
# START_TIME = "${exec_info.start_time}"
`
await fs . promises . appendFile ( cmd_log , start_metadata )
return {
. . . exec_info ,
getResult ,
}
}
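// example usage (sketch; the binary, args, and cwd below are illustrative):
// const {getResult} = await saveExecResult('yt-dlp', ['--write-info-json', original_url], {original_url, version}, {cwd: './archive/1709724410.19269/media'})
// const result = await getResult()          // resolves once the process exits (or rejects on timeout)
// console.log(result.returncode, Object.keys(result.output_files))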
const HASH_CACHE = { }
async function sha256File ( file_path : string , { pwd = null } : { pwd? : string } = { } ) {
return new Promise ( ( resolve , reject ) = > {
pwd = pwd || path . dirname ( file_path ) ;
if ( ! file_path . startsWith ( pwd ) ) {
file_path = path . join ( pwd , file_path ) ;
}
const dirent = fs . statSync ( file_path ) ;
const abspath = fs . realpathSync ( file_path ) ;
const cache_key = ` ${ abspath } : ${ dirent . size } : ${ dirent . mtimeMs } ` ; // PATH:SIZE:LAST_MODIFIED_TIME
if (cache_key in HASH_CACHE) {
resolve(HASH_CACHE[cache_key]);
return;   // skip re-hashing the file on a cache hit
}
const hash = crypto . createHash ( 'sha256' ) ;
const rs = fs . createReadStream ( abspath ) ;
rs . on ( 'error' , reject ) ;
rs . on ( 'data' , chunk = > hash . update ( chunk ) ) ;
rs . on ( 'end' , ( ) = > {
const final_hash = hash . digest ( 'hex' ) ;
HASH_CACHE [ cache_key ] = final_hash ;
resolve ( final_hash ) ;
} ) ;
} ) as Promise < string >
}
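// e.g. (illustrative) const hash = await sha256File('./archive/1709724410.19269/screenshot.png')   // 64-char hex digest, memoized by path:size:mtime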
async function getDirSha256 ( dir_path , { pwd = null , withRoot = true , filter = null , maxdepth = - 1 , subfiles = null } = { } ) {
// console.log('CALCULATING SHA256 OF FILES IN DIR', dir_path, {withRoot, filter, maxdepth})
// dir_path: path absolute or relative path of the directory you want the merkle sha256 for
// pwd: path (optional) absolute path of the directory you want to interpret dir_path relative to
// withRoot: bool include a summary entry for the root dir_path dir in the list as '.'
// filter: function (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
// maxdepth: number (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
// subfiles: dirent[] (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
pwd = pwd || dir_path
if ( ! dir_path . startsWith ( pwd ) ) {
dir_path = path . join ( pwd , dir_path )
}
const dirent = await fs . promises . stat ( dir_path )
assert ( dirent . isDirectory ( ) , ` Tried to compute sha256 of path but missing or not a directory! ${ dir_path } ` )
assert ( ( maxdepth >= - 1 ) , ` maxdepth must be -1, 0, or 1, 2, 3, etc... (got ${ maxdepth } ) ` )
// assert(!(filter && withRoot), `Cannot generate root hash (consistently) when a custom filter is provided!`)
// get the sha256 of every file in a directory recursively (excluding hidden files and symlinks)
// EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum
const all_subfiles = ( subfiles as string [ ] ) || await getDirEntries ( dir_path , {
pwd ,
recursive : true ,
includeFiles : true ,
includeDirs : false ,
// ~~maxdepth,~~ // dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir hashes.
// it never makes sense to ignore subfiles beyond a certain depth for hash calculation. Hashes are
// only useful IDs if they are consistent+repeatable, hashing to an arbitrary depth will produce
// many different hashes for the same directory, which is not something we need/want polluting the hash space.
filter , // we do however allow passing a manual filter funcs which does actually affect the hash
// this is useful to allow quick checks to see whether a certain subset of files has changed or not
} )
const hashes : { [ key : string ] : string } = { }
let hashable_summary_str = ''
for ( const subfile of all_subfiles ) {
// {'versions/20240413144307/screen recording.mp4': '1df4d9c3aca8b36f1f73e327d56038f80a35db407a298edb16c72576d7dd894e', ...}
hashes [ subfile ] = await sha256File ( subfile , { pwd } )
const relpath = path . relative ( await fs . promises . realpath ( dir_path ) , await fs . promises . realpath ( path . join ( pwd , subfile ) ) )
hashable_summary_str += ` ${ hashes [ subfile ] } ./ ${ relpath } \ n `
}
// console.log('CALCULATED HASHES FOR ALL SUBFILES IN DIR', dir_path, Object.keys(hashes).length)
// get list of subdirectories and recursively hash every subdirectory
// EQUIVALENT TO: find . -type d -not -path '*/.*' -maxdepth ${maxdepth} -print | sort
const subdirs = await getDirEntries ( dir_path , { pwd , recursive : true , includeHidden : false , includeDirs : true , includeFiles : false , filter , maxdepth } )
// for each subdirectory, get its hash recursively and store it in the hash list
for ( const subdir of subdirs ) {
// console.log('GETTING SUBDIR HASH', subdir)
// a directory's hash is defined as the hash of all the *files* within (excluding dirs/symlinks/hidden)
const subdir_hashes = await getDirSha256 (
subdir ,
{ pwd , withRoot : true , filter , maxdepth : 0 } ,
)
hashes [ subdir ] = subdir_hashes [ '.' ]
}
// console.log('CALCULATED HASHES FOR ALL SUBDIRS IN DIR', dir_path, subdirs.length)
// filter results if maxdepth is provided
if ( maxdepth >= 0 ) {
for ( const subpath of Object . keys ( hashes ) ) {
if ( pathDepth ( subpath ) > maxdepth ) {
delete hashes [ subpath ]
}
}
}
// console.log('LIMITED OUTPUT DUE TO MAXDEPTH', maxdepth, Object.keys(hashes).length)
// calculate the hash of the root '.' folder by hashing all of hashes of its contents
// EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum
if ( withRoot ) {
// pass the first command's output containing the file list + hashes into another sha256
// to get the final hash of the whole directory combined
// console.log('CALCULATING FINAL ROOT HASH for ', dir_path)
// console.log(hashable_summary_str)
hashes [ '.' ] = crypto . createHash ( 'sha256' ) . update ( hashable_summary_str ) . digest ( 'hex' ) as string
// console.log('--->', hashes['.'])
}
return hashes
}
async function getDirInfo ( dir_path , { pwd = null , withRoot = true , withHelpers = true , filter = null , maxdepth = - 1 , subfiles = null } = { } ) {
// get a detailed JSON/dumpable index of a directory's contents, w/ merkle sha256's, sizes, and mimeTypes
// dir_path: path absolute or relative path of the directory you want size info for
// pwd: path (optional) absolute path of the directory you want to interpret dir_path relative to
// withRoot: bool include a summary entry for the root dir_path dir in the list as '.'
// withHelpers: bool attach many extra helper attrs/funcs to results (beyond JSON-serializable core data)
// filter: function (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
// maxdepth: number (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
// subfiles: dirent[] (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
// {
// ...
// 'example.txt': { ... },
// 'foobar/example.mp3': { ... },
// '.': { // this is the fully augmented result when withHelpers=true
// is_file: false,
// is_dir: true,
// filename: '.',
// basename: '1709039915.378868',
// mimeType: 'inode/directory'
// extension: undefined,
// num_bytes: 11540961,
// num_subpaths: 15,
// sha256: '9fc58b3ed887e7139338062ebd49bd6795373759e8acb73d2f7a40f1413789da',
// reldepth: 1,
// relpath: './',
// cwd: '/opt/archivebox/data/archive/1709039915.378868/',
// dirname: '/opt/archivebox/data/archive',
// abspath: '/opt/archivebox/data/archive/1709039915.378868',
// dirent: Stats {
// dev: 16777240,
// mode: 16895,
// uid: 501,
// ...
// mtimeMs: 1717160622956.1357,
// ctimeMs: 1717160622956.1357,
// },
// created: '2024-05-31T13:03:42.956Z',
// modified: '2024-05-31T13:03:42.956Z',
// summary: './data/archive/1709039915.378868 (inode/directory 11541kb 9fc58b3e)',
// helptext: 'Verify these hashes by running:\n' +
// ' cd /opt/archivebox/data/archive/1709039915.378868 \n' +
// " find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum",
// },
// }
pwd = pwd || dir_path
if ( ! dir_path . startsWith ( pwd ) ) {
dir_path = path . join ( pwd , dir_path )
}
// calculate hashes and sizes recursively
const hashes = await getDirSha256 ( dir_path , { pwd , withRoot , filter , maxdepth , subfiles } )
const sizes = await getDirSizes ( dir_path , { pwd , withRoot , filter , maxdepth , subfiles } )
const num_total_subpaths = Object . keys ( hashes ) . filter ( name = > name !== '.' ) . length
const details = { }
for ( const [ filename , sha256 ] of Object . entries ( hashes ) ) {
if ( filename === '.' && ! withRoot ) continue
const abspath = await fs . promises . realpath ( path . join ( dir_path , filename ) )
const dirent = await fs . promises . stat ( abspath )
const num_subpaths = Object . keys ( hashes ) . filter ( subpath = > subpath . startsWith ( filename + '/' ) ) . length
const is_file = dirent . isFile ( )
const is_dir = dirent . isDirectory ( )
// bare-bones info suitable for JSON dumps/exports
const basic_info = {
sha256 ,
num_bytes : sizes [ filename ] ,
created : ( new Date ( dirent . ctimeMs ) ) . toISOString ( ) ,
mimeType : undefined ,
extension : undefined ,
num_subpaths : undefined ,
}
if ( is_dir ) {
basic_info . mimeType = 'inode/directory'
basic_info . extension = undefined
basic_info . num_subpaths = ( filename === '.' ) ? num_total_subpaths : num_subpaths
}
if ( is_file ) {
basic_info . mimeType = mime . lookup ( abspath ) || null
basic_info . extension = path . extname ( filename )
basic_info . num_subpaths = undefined
}
// extra helpers suitable for usage in other areas of the codebase
const info_with_helpers = {
. . . basic_info ,
filename ,
basename : path.basename ( abspath ) ,
dirname : path.dirname ( abspath ) ,
cwd : dir_path ,
relpath : is_dir ? ( filename + '/' ) : filename ,
reldepth : pathDepth ( filename ) ,
abspath ,
is_file ,
is_dir ,
dirent ,
modified : ( new Date ( dirent . mtimeMs ) ) . toISOString ( ) ,
summary : ` ${ prettyPath ( abspath ) } ( ${ basic_info . mimeType } ${ Math . round ( basic_info . num_bytes / 1000 ) } kb ${ sha256 . substring ( 0 , 8 ) } ) ` ,
helptext : undefined ,
}
if ( filename === '.' ) {
info_with_helpers . helptext = ` Verify these hashes by running: \ n cd ${ prettyPath ( abspath ) } \ n find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum `
}
if ( ( typeof filter ) === 'function' ) {
if ( ! filter ( info_with_helpers ) ) continue
}
details [ filename ] = withHelpers ? info_with_helpers : basic_info
}
return details
}
// console.log(await getDirSha256(
// '/opt/archivebox/data/archive/1709039915.378868/',
// {
// withRoot: true,
// maxdepth: -1,
// filter: ({relpath}) => relpath.startsWith('versions'),
// },
// ))
// console.log(await getDirSizes(
// '/opt/archivebox/data/archive/1709039915.378868/',
// {
// withRoot: false,
// maxdepth: 2,
// filter: ({relpath}) => !relpath.startsWith('versions'),
// },
// ))
// console.log(await getDirInfo(
// '/opt/archivebox/data/archive/1709039915.378868/',
// {
// withRoot: true,
// withHelpers: true,
// maxdepth: 1,
// // filter: ({relpath}) => relpath.startsWith('versions'),
// },
// ))
type DetectFilenameOptions = {
url? : string ,
response? : HTTPResponse | Response ,
page? : Page ,
dir? : string ,
abspath? : string ,
filename? : string ,
basename? : string ,
extension? : string ,
mimeType? : string ,
resourceType? : string ,
}
async function detectFilename ( { url , response , page , dir , abspath , filename , basename , extension , mimeType , resourceType } : DetectFilenameOptions ) {
// this function takes a url (and/or response/page), and detects the abspath,dir,filename,basename,extention,mimeType
// from the URL (+ any enforced path components passed in via args)
// example: detectFilename({url: 'https://example.com/favicon.png', extension: 'ico'}) outputs 'favicon.ico'
//
// it has some quirks that are specific to archiving and may not behave as you expect
// e.g. if visiting the url https://example.com/error.zip returns a 500 text/html error page
// this may still save it as a .zip with mimeType=application/x-zip and ignore the response mimeType, because the url ends in .zip
// however, if the url has no extension, e.g. https://example.com/error it will
// auto-detect the mimeType based on the response and append an extension, saving as error.html
//
// ⚠️ SECURITY WARNING: think carefully about the permissions, shell injection, and RCE implications of any changes made here ⚠️
// this function writes untrusted web content to the filesystem using the auto-detected mimetype to coerce the extension,
// which can be dangerous (e.g. what if one of these downloads is malicious ransomware, do we really want to give it a .exe extension?
// and if we do, how do we make sure it never gets executed, without damaging the integrity of the copy?)
if ( ! ( response || page ) ) throw 'Either a page or a response must be provided in order to detect mimeType & URL'
if ( response && ( typeof response . headers !== 'function' ) ) {
const node_fetch_response : Response = response as Response
response = {
url : ( ) = > node_fetch_response . url ,
headers : ( ) = > node_fetch_response . headers ,
} as unknown as HTTPResponse
}
response = response as HTTPResponse
url = url || response ? . url ( ) || ( await page . url ( ) )
if ( ! url ) throw 'URL was not provided and could not be detected from {response, page}'
// Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
try {
resourceType = resourceType || response ? . request ( ) ? . resourceType ( )
} catch ( err ) {
// ignore, sometimes response is null/not available
}
const resourceTypeToMimeType = {
'Stylesheet' : 'text/css' ,
'Script' : 'application/x-javascript' ,
'WebSocket' : 'application/json' ,
'Website' : 'text/html' ,
}
mimeType = mimeType || resourceTypeToMimeType [ resourceType ] // guess extension based on request resourceType
extension = extension || ( mimeType ? mime . extension ( mimeType ) : null )
// handle special url cases (e.g. schemes in URL_SCHEMES_IGNORED)
if ( url . startsWith ( 'about:blank' ) ) {
filename = 'about_blank'
mimeType = 'text/html'
}
else if ( url . startsWith ( 'data:' ) ) {
filename = ` data__ ${ hashCode ( url ) } `
}
// console.log('detectFilename>', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})
if ( abspath ) {
if ( dir || filename || basename || extension )
throw '{abspath} should not be passed with other options (e.g. dir, filename, basename, extension)'
var { dir , base : filename , ext : extension , name : basename } = path . parse ( abspath )
// path.parse('/home/user/dir/file.txt') returns:
// { root: '/',
// dir: '/home/user/dir',
// base: 'file.txt',
// ext: '.txt',
// name: 'file' }
} else {
dir = dir || path . resolve ( process . cwd ( ) )
filename = filename // https://example.com/a.1.zip?e.pdf=2#g.h=3 => a.1.zip
|| (new URL(url)).pathname.split('/').at(-1) // https://example.com/file124.rss => file124.rss  takes the last path component (query/hash excluded), falls through to 'index' below if the path ends in /
|| 'index' // https://example.com/abc/def/ => index.html
//|| (new URL(url)).hostname.replaceAll('.', '_') // https://example.com => example_com (but if disabled, this would be index.html)
}
if ( ! filename ) throw 'filename/abspath were not passed and could not be detected from url'
const path_extname = path . extname ( filename )
const resp_mimetype = response && (
( response as any ) . mimeType
|| response.headers()['content-type']?.split(';')[0]
|| resourceTypeToMimeType [ resourceType ]
|| 'application/octet-stream'
)
mimeType = mimeType // https://example.com/a.1.zip?e.pdf=2#g.h=3 => application/x-zip prefers mimetype based on extension in path, falls back to response mimeType
|| ( path_extname && mime . lookup ( path_extname ) ) // https://example.com/file124.rss => application/rss+xml
|| resp_mimetype // https://example.com/get?type=png => image/png
extension = extension
|| ( path_extname && path_extname . replace ( '.' , '' ) ) // https://example.com/a.1.zip?e.pdf=2#g.h=3 => zip prefers extension in path, falls back to response mimeType's suggested extension
|| ( resp_mimetype && mime . extension ( resp_mimetype ) ) // https://example.com => html
|| '' // https://example.com/websocket.1 =>
if ( extension . startsWith ( '.' ) )
extension = extension . slice ( 1 )
basename = basename // https://example.com/a.1.zip?e.pdf=2#g.h=3 => a.1  prefers the filename from the path (without extension), falls back to the domain name
|| ( path . parse ( filename ) . name ) // https://mp4dl.example.com => mp4dl_example_com
basename = basename . slice ( 0 , 120 ) // truncate at 120 characters (leaving 8 chars for .ext)
basename = basename . replace ( /[^a-zA-Z0-9%+?&=@;_ \.-]/g , '' ) // strip characters not allowed in filenames
filename = basename + '.' + extension
if ( filename . endsWith ( '.' ) )
filename = filename . slice ( 0 , - 1 )
abspath = abspath || path . join ( dir , filename )
// console.log('detectFilename<', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})
return {
url ,
dir ,
abspath ,
filename ,
basename ,
extension ,
mimeType ,
resourceType ,
resp_mimetype ,
}
}
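// illustrative sketch of how detectFilename() resolves names (not executed; exact values depend on the live response headers,
// and `resp` here is assumed to be the HTTPResponse from a prior page.goto()):
// const a = await detectFilename({url: 'https://example.com/report.v2.pdf?dl=1', dir: '/tmp', response: resp})
// //   path extension wins:  => {filename: 'report.v2.pdf', basename: 'report.v2', extension: 'pdf', mimeType: 'application/pdf', ...}
// const b = await detectFilename({url: 'https://example.com/error', response: resp})
// //   no path extension, so the response content-type (e.g. text/html) decides:  => {filename: 'error.html', extension: 'html', mimeType: 'text/html', ...}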
interface DownloadOptions extends DetectFilenameOptions {
browser?: Browser
expected_mimetype?: string
timeout?: number
}
async function download({url, browser, page, response, dir, abspath, filename, basename, extension, expected_mimetype, timeout}: DownloadOptions) {
url = url || (response as HTTPResponse)?.url() || (await page?.url())
ALREADY_ARCHIVED.add(url.slice(0, 4096))  // prevent running the whole archive task on tabs we create just for downloading
browser = browser || (page && (await page.browser()))
timeout = timeout || 120_000
expected_mimetype = expected_mimetype || ''
let newPage = null
let errors = [ ]
let num_bytes = 0
let bytesBuffer = null
// if we need to fetch the url (i.e. it's not already been requested)
if ( ! response ) {
if ( ! browser ) throw 'No {browser} or {page} was provided to download with'
newPage = await browser . newPage ( )
if ( page ) await page . bringToFront ( ) // if origin page is provided, make sure it stays in foreground
response = await newPage . goto ( url , { timeout : timeout , waitUntil : 'networkidle0' } )
if ( page ) await page . bringToFront ( ) // if origin page is provided, make sure it stays in foreground
}
url = url || (response as HTTPResponse)?.url() || (await newPage?.url()) || (await page?.url());
const response_mimetype = (response as HTTPResponse).headers()['content-type']?.split(';')[0] || 'text/html'
// detect the filename we should write to based on provided url/response/page/filename/extension suggestions
var {
dir ,
abspath ,
filename ,
basename ,
extension ,
mimeType ,
} = await detectFilename ( { url , page , response , dir , abspath , filename , basename , extension , mimeType } )
// if expected_mimetype is passed, make sure the response content-type matches it, otherwise consider the download a failure
if (!response_mimetype.startsWith(expected_mimetype)) {
errors.push(`Expected ${expected_mimetype} but got ${response_mimetype}`)
} else {
// download the file using puppeteer's response.buffer()
try {
// write the response bytes into the output file
bytesBuffer = await ( response as HTTPResponse ) . buffer ( )
await overwriteFile ( abspath , bytesBuffer )
num_bytes = bytesBuffer . length
} catch ( err ) {
errors . push ( err )
}
// security check to make sure downloaded file is not executable (random binaries downloaded off the internet = dangerous)
fs.access(abspath, fs.constants.X_OK, (err) => {
if ( ! err ) console . warn (
'[⚠️] SECURITY WARNING: Downloaded file appears to be executable:' , prettyPath ( abspath ) ,
'\n (be careful running untrusted programs downloaded from the internet!)'
)
} )
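// possible extra hardening (not enabled, just a sketch): strip the exec bits instead of only warning,
// at the cost of not preserving the downloaded file's original mode bits in the archived copy:
// try { await fs.promises.chmod(abspath, 0o644) } catch (err) { errors.push(err) }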
}
// if we opened a dedicated page just for downloading, close it now
if (newPage) {
await newPage.close()
}
if ( errors . length ) {
// console.warn(`[❌] Downloading ${url} (${mimeType}) to ${abspath} failed:`, JSON.stringify(errors, null, 4))
} else {
console.log(`[💾] Downloaded ${url.substring(0, 40)} (${num_bytes} ${mimeType})...`.padEnd(82), prettyPath(abspath))
}
return {
url , response , errors ,
dir , abspath , filename , basename , extension , mimeType ,
bytesBuffer , num_bytes ,
}
}
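// minimal usage sketch (not executed): fetch a favicon into a task dir, treating any non-image response as a failure
// assumes `browser` is an already-launched Browser and TASK_PATH() is the per-URL output dir helper used elsewhere in this file
// const {abspath, errors} = await download({
//     browser,
//     url: 'https://example.com/favicon.ico',
//     dir: TASK_PATH('https://example.com/'),
//     expected_mimetype: 'image/',
// })
// if (errors.length) console.warn('favicon download failed:', errors)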
/************************** Puppeteer Launching *******************************/
async function startCluster ( puppeteer , args = CHROME_ARGS_DEFAULT ) {
console.log(`[🎭] Launching ${CHROME_CLUSTER_WORKERS}x Chromium browsers with puppeteer-cluster:`.padEnd(82), prettyPath(CHROME_PROFILE_PATH))
const cluster = await Cluster . launch ( {
puppeteer ,
monitor : true ,
maxConcurrency : CHROME_CLUSTER_WORKERS ,
sameDomainDelay : 2550 ,
workerCreationDelay : 250 ,
timeout : 300_000 , // total ms timeout for an entire task (1000ms * 60s * 5m)
concurrency : Cluster.CONCURRENCY_PAGE , // share cookies between all tabs in a given browser
puppeteerOptions : {
args , // all the chrome launch CLI args
ignoreDefaultArgs : true , // trust me, we have enough args already...
// dumpio: true, // full debug log output, super noisy
}
} )
console . log ( '*************************************************************************' )
return cluster
}
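// minimal usage sketch (not executed): puppeteer-cluster passes {page, data} to each queued task fn,
// which is exactly how botArchiveTask gets invoked from main() below
// const cluster = await startCluster(Puppeteer, args)
// await cluster.queue('https://example.com/', async ({page, data: url}) => {
//     await page.goto(url, {waitUntil: 'networkidle2'})
//     await page.screenshot({path: 'example.png'})
// })
// await cluster.idle(); await cluster.close()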
async function remoteBrowser ( puppeteer , { browserURL , browserWSEndpoint } ) {
console . log ( '[🎭] Connecting Puppeteer to existing Chromium browser via:' , browserURL || browserWSEndpoint )
let completed_initial_connection = false
const browser = await puppeteer.connect({browserURL, browserWSEndpoint, defaultViewport: null, targetFilter: () => completed_initial_connection})  // ignore pre-existing targets during the initial connect, accept all new targets afterwards
completed_initial_connection = true
console . log ( '*************************************************************************' )
return browser
}
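// usage sketch: attach to a Chromium you started yourself with CDP exposed, e.g.:
//   chromium --remote-debugging-port=9222 --user-data-dir=/path/to/some/profile
// const browser = await remoteBrowser(Puppeteer, {browserURL: 'http://localhost:9222'})
// (browserURL lets puppeteer discover the websocket endpoint itself; pass browserWSEndpoint instead if you already have the full ws:// url)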
async function startBrowser ( puppeteer , args = CHROME_ARGS_DEFAULT ) {
console . log ( '[🎭] Launching Puppeteer Chromium browser...' . padEnd ( 82 + 1 ) , prettyPath ( CHROME_PROFILE_PATH ) )
const browser = await puppeteer . launch ( { ignoreDefaultArgs : true , args , dumpio : true } )
globalThis . browser = browser
console . log ( '*************************************************************************' )
// store all active tabs on global var by url for easier vscode interactive debugging
const storeTabForDebugger = async (target) => {
try {
globalThis . tabs = globalThis . tabs || { }
const url = target . url ( )
const page = await target . page ( )
if (!page || page?.isClosed()) {
delete globalThis . tabs [ url ]
} else {
globalThis . tab = page
globalThis . tabs [ url ] = page
}
} catch ( err ) { console . warn ( err ) }
}
browser . on ( 'targetcreated' , storeTabForDebugger )
browser . on ( 'targetchanged' , storeTabForDebugger )
browser . on ( 'targetdestroyed' , storeTabForDebugger )
// wait for initial extension background.js/service worker targets to load
await wait(3_000)
// prime the extensions cache
const extensions = await getChromeExtensionsFromCache ( { browser } )
globalThis . extensions = extensions // for easier debugging only
// give the user 2min to check any issues with the initial startup pages (bot profile pages),
// solve captchas, re-login, etc. then close them after that to save resources
const startup_pages = ( await browser . pages ( ) )
const startup_page_close_delay = 120_000
setTimeout(async () => {
for ( const page of startup_pages ) {
try { await page . close ( ) } catch ( err ) { /* page may already be closed by now, which is fine */ }
}
} , startup_page_close_delay )
// setup any extensions that need final runtime configuration using their options pages
// await setup2CaptchaExtension({browser, extensions})
// open a placeholder page so browser window stays open when there are no active archiving pages
// (it's annoying to have the entire window open/close/open/close/etc every time an archive task runs)
const empty_page = await browser . newPage ( )
await wait ( 250 )
await empty_page . goto ( 'chrome://version' )
await wait ( 500 )
console . log ( '*************************************************************************' )
return browser
}
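// the globalThis.tab / globalThis.tabs globals set above are only for interactive debugging, e.g. from a vscode debug console:
//   await globalThis.tabs['https://example.com/'].title()
//   await globalThis.tab.screenshot({path: 'debug.png'})
// (tabs is keyed by target URL, so the exact keys depend on whichever pages happen to be open)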
async function startAPIServer ( port = API_SERVER_PORT , host = API_SERVER_HOST , taskCallback = null ) {
// taskCallback should be an async function that takes ({url}) and does something with it
assert ( taskCallback && ( typeof taskCallback === 'function' ) )
const server = createServer(async (req, res) => {
if ( req . method === 'POST' ) {
console.log(`[API][POST] ${req.url}`)
let body = '';
req.on('data', (chunk) => {
body += chunk;
});
req.on('end', () => {
try {
const jsonData = JSON . parse ( body ) ;
// Process the JSON data
console . log ( jsonData ) ;
res . writeHead ( 200 , { 'Content-Type' : 'application/json' } ) ;
res . end ( JSON . stringify ( { message : 'JSON data received' } ) ) ;
} catch ( error ) {
res . writeHead ( 400 , { 'Content-Type' : 'application/json' } ) ;
res . end ( JSON . stringify ( { error : 'Invalid JSON data' } ) ) ;
}
} ) ;
} else if ( req . method === 'GET' ) {
console.log(`[API][GET] ${req.url}`)
const parsedUrl = new URL(`http://${host}:${port}${req.url}`)
const query = new URLSearchParams(parsedUrl.search);
const url = query.get('url');
if (url && url.includes('://')) {
res.writeHead(200, {'Content-Type': 'text/plain'});
try {
await taskCallback({url})
res.end(`${url}\n${TASK_PATH(url)}`);
} catch (err) {
res.end(`${url}\n${TASK_PATH(url)}\n${err}`);
}
} else {
res.writeHead(400, {'Content-Type': 'text/plain'});
res.end(`Bad URL: ${url}\n\nExpected: /?url=https://example.com/url/to/archive`);
}
} else {
res . writeHead ( 405 , { 'Content-Type' : 'application/json' } ) ;
res . end ( JSON . stringify ( { error : 'Method not allowed' } ) ) ;
}
} )
server.listen(port, host, () => {
console.log(`[🎰] API Server listening for requests on http://${host}:${port}/?url=...`);
} )
console . log ( '*************************************************************************' )
return server
}
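// example requests against this endpoint (host/port come from API_SERVER_HOST / API_SERVER_PORT defined earlier in this file,
// 127.0.0.1:8080 below is just a placeholder):
//   curl 'http://127.0.0.1:8080/?url=https://example.com/page/to/archive'      # GET: runs an archive task and replies with the task path
//   curl -X POST 'http://127.0.0.1:8080/' -d '{"url": "https://example.com"}'  # POST: currently just parses + logs the JSON body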
async function main ( urls , cluster = CHROME_CLUSTER ) {
process . chdir ( DATA_DIR )
const extensions = await getChromeExtensionsFromPersona ( { CHROME_EXTENSIONS , CHROME_EXTENSIONS_DIR } )
const args = getChromeArgs ( { . . . CHROME_LAUNCH_OPTIONS , CHROME_EXTENSIONS : extensions } )
const preferences = getChromePreferences ( { CHROME_PREFERENCES_DEFAULT , CHROME_PREFERENCES_EXTRA , CHROME_DOWNLOADS_DIR , CHROME_EXTENSIONS : extensions } )
const Puppeteer = applyChromePreferences ( PupeteerExtra , CHROME_PREFERENCES_PATH , preferences )
Puppeteer . use ( StealthPlugin ( ) ) ;
// Puppeteer.use(ReplPlugin());
// handled by uBlock Origin & ReCaptcha browser extensions, probably not needed here anymore:
// Puppeteer.use(RecaptchaPlugin({
// provider: {id: '2captcha', token: API_KEY_2CAPTCHA},
// visualFeedback: true,
// }))
// const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
// puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
if ( cluster ) {
// launch browser with multiple tabs w/ puppeteer
const cluster = await startCluster ( Puppeteer , args )
const handleTask = async ({url}) => cluster.queue(url, botArchiveTask)
const server = await startAPIServer ( API_SERVER_PORT , API_SERVER_HOST , handleTask )
console . log ( '[📋] Running tasks in parallel with puppeteer cluster...' )
for ( const url of urls ) {
if ( fs . existsSync ( path . join ( TASK_PATH ( url ) , 'aiqa.json' ) ) ) {
try {
JSON . parse ( ( await fs . promises . readFile ( path . join ( TASK_PATH ( url ) , 'aiqa.json' ) ) ) . toString ( ) )
console . log ( ' skipping (already present):' , TASK_PATH ( url ) , url )
continue
} catch ( err ) {
// pass
}
}
cluster . queue ( url , botArchiveTask )
await wait(3_000)
}
await cluster . idle ( ) ;
await cluster . close ( ) ;
} else {
// launch single new browser w/ puppeteer / connect to remote CDP browser w/ puppeteer
const browser = await startBrowser ( Puppeteer , args )
// const browser = await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})
// run speedtest in the background
speedtest ( { browser } )
const handleTask = async ({url}) => await botArchiveTask({page: (await browser.newPage()), data: url})
const server = await startAPIServer ( API_SERVER_PORT , API_SERVER_HOST , handleTask )
// wait for any pre-run setup tasks or server requests
await wait(5_000)
let num_succeeded = 0
let num_failed = 0
console . log ( ` [📋] Running ${ urls . length } tasks sequentially with puppeteer browser... ` )
for ( const url of urls ) {
const run_count = ( num_succeeded + num_failed ) || 1
// check if task should be run or skipped based on existing snapshot data present in directory
const metrics_path = path . join ( TASK_PATH ( url ) , 'metrics.json' )
const screenshot_path = path . join ( TASK_PATH ( url ) , 'screenrecording.gif' )
const aiqa_path = path . join ( TASK_PATH ( url ) , 'aiqa.json' )
const versions_path = path . join ( TASK_PATH ( url ) , 'versions' )
if ( fs . existsSync ( metrics_path ) && fs . existsSync ( screenshot_path ) && fs . existsSync ( aiqa_path ) && fs . existsSync ( versions_path ) ) {
try {
const ai_qa_result = JSON . parse ( await fs . promises . readFile ( aiqa_path , 'utf-8' ) )
console . log ( prettyPath ( TASK_PATH ( url ) ) , ` ${ ai_qa_result . pct_visible } % ` , ai_qa_result . website_brand_name , url . substring ( 0 , 80 ) )
assert ( ai_qa_result . website_brand_name )
continue
} catch ( err ) {
// pass
}
}
let delay = 0
// create a new browser page and run the archiving task
const page = ( await browser . newPage ( ) )
try {
console . log ( ANSI . black + ` ◤==============================================================================[ ${ String ( run_count ) . padStart ( 3 ) } ]/[ ${ urls . length } ]◥ ` + ANSI . reset )
await botArchiveTask ( { page , data : url } )
delay = 1_000
num_succeeded += 1
} catch ( err ) {
console . error ( '[❌] Archiving task failed!' , url )
console . error ( err )
num_failed += 1
delay = 15_000  // extra delay if there are errors
}
console . log ( ANSI . black + ` ◣==============================================================================[☑ ${ num_succeeded } ][🆇 ${ num_failed } ]◢ ` + ANSI . reset )
// check for abnormally high failure rates and exit early if needed
const failure_pct = Math . round ( ( num_failed / run_count ) * 100 )
if ( failure_pct > 50 ) {
if ( run_count > 5 ) {
console . warn ( ` [⚠️] ${ failure_pct } % Task failure rate is very high! Will self-cancel after 10 URLs if >50% continue to fail... ` )
}
if ( run_count > 10 ) {
throw ` Too many tasks failed in a row! Quitting early after ${ run_count } / ${ urls . length } tasks. `
}
}
// increase the delay between tasks based on the ratio of failing:succeeding tasks
delay = Math.pow(4, (num_failed / (num_succeeded + 3))) * delay
// multiplier = 4^(num_failed / (num_succeeded + 3)), applied to the 1s (success) / 15s (failure) base delay, e.g.:
//     0 failed, 1 succeeded == 4^0 ==  1x ~ 1s
//     4 failed, 1 succeeded == 4^1 ==  4x ~ 1m
//     8 failed, 1 succeeded == 4^2 == 16x ~ 4m
//    12 failed, 1 succeeded == 4^3 == 64x ~ 16m
// etc... up to the 1hr cap below
delay = Math.min(delay, 3_600_000)  // 1hr maximum delay between tasks
delay = Math.max(delay, 1_000)      // 1s minimum delay between tasks
if (delay > 2_500) {
console.log('... waiting', Math.round(delay / 1000), 'seconds (self rate-limit)...')
}
await wait ( delay ) // base ratelimit
console . log ( )
}
if ( PASSIVE_ARCHIVING ) {
// replace these as-needed:
const browserURL = 'http://localhost:9222/'
const browserWSEndpoint = 'ws://localhost:9222/devtools/browser'
const driver_browser = browser || await remoteBrowser ( Puppeteer , { browserURL , browserWSEndpoint } )
const archiver_browser = { } //await startBrowser(Puppeteer, args)
const extensions = await getChromeExtensionsFromCache ( { browser : driver_browser } )
// close both browsers if either one is closed
let browser_is_open = true
driver_browser.on('disconnected', async () => { browser_is_open = false })  // await archiver_browser.close()
// archiver_browser.on('disconnected', async () => {browser_is_open = false; await driver_browser.close()})
// handle any tab navigation to a new URL in the driver browser
const handleUserNavigation = async (target) => {
const url = target . url ( )
const page = await target . page ( )
// const client = await target.createCDPSession()
if ( target . type ( ) == 'page' && page && url ) {
console . log ( ANSI . black + '==============================================================================' + ANSI . reset )
console . warn ( '[➕ ] DRIVER BROWSER NAVIGATED:' , ANSI . blue , url , ANSI . reset )
try {
await passiveArchiveTask ( { browser : driver_browser , page , url } )
await wait(3_000)
} catch ( err ) {
console . error ( '[❌] Archiving task failed!' , url )
console . error ( err )
await wait(10_000)  // base ratelimit
}
console . log ( ANSI . black + '==============================================================================' + ANSI . reset )
// await client.send('Page.enable')
// await client.send('Page.setWebLifecycleState', {state: 'active'})
}
// await client.send('Runtime.runIfWaitingForDebugger')
}
// setup handler to archive new page whenever one is opened
driver_browser . on ( 'targetcreated' , handleUserNavigation )
driver_browser . on ( 'targetchanged' , handleUserNavigation )
console . log ( '------------------------------------------------------' )
console . log ( '[👀] Waiting for browser tabs to be opened by human...' )
while ( browser_is_open ) {
await wait(2_000)
}
} else {
while ( true ) {
await wait(2_000)
}
}
await browser . close ( )
}
console . log ( '[✅] Finished all tasks and stopped browsers.' )
process . exit ( 0 ) ;
}
/******************************************************************************/
if ( import . meta . main ) {
main ( URLS ) . catch ( console . error ) ;
}
/******************************************************************************/
// if we want to handle CLI args in the future, minimist is great:
// var argv = require('minimist')(process.argv.slice(2));
// console.log(argv); // --url=https://example.com --binpath=/browsers/chromium-1047/bin/chromium --datadir=/Chromium
// const {url, binpath, datadir} = argv;
// OLD CODE, may be useful in the future if we need audio in screenrecordings:
// async function setupScreenrecordingWithAudio(page, wss) {
// console.log('[🎬] Setting up screen-recording plugin...');
// const stream_port = (await wss).options.port;
// // streamPage = await (page.browser()).newPage()
// await page.goto(`chrome-extension://jjndjgheafjngoipoacpjgeicjeomjli/options.html#${stream_port}`)
//
// // puppeteer-stream recording start
// streamFile = fs.createWriteStream(SCREENRECORDING_PATH(page))
// stream = await getStream(page, {
// audio: true,
// video: true,
// bitsPerSecond: 8000000, // 1080p video
// });
// stream.pipe(streamFile);
// return {stream, streamFile}
//
// // puppeteer-stream recording stop & cleanup
// if (stream && streamFile) {
// await stream?.destroy();
// streamFile?.close();
// // await streamPage.close();
// }
// }