Mirror of https://github.com/zebrajr/ArchiveBox.git (synced 2026-01-15 12:15:10 +00:00)
Commit: 'Refactored by Sourcery'
@@ -19,6 +19,7 @@ Documentation:
 
 """
 
+
 __package__ = 'archivebox'
 
 import os
@@ -67,13 +68,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SHELL_CONFIG': {
         'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
         'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
-        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
+        'SHOW_PROGRESS': {
+            'type': bool,
+            'default': lambda c: (
+                c['IS_TTY'] and platform.system() != 'Darwin'
+            ),
+        },  # progress bars are buggy on mac, disable for now
         'IN_DOCKER': {'type': bool, 'default': False},
         'PUID': {'type': int, 'default': os.getuid()},
         'PGID': {'type': int, 'default': os.getgid()},
         # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
     },
 
     'GENERAL_CONFIG': {
         'OUTPUT_DIR': {'type': str, 'default': None},
         'CONFIG_FILE': {'type': str, 'default': None},
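
Note: every `'default'` above that is a lambda is a computed default: the callable receives the partially-resolved config dict `c`, which is why `USE_COLOR` and `SHOW_PROGRESS` can derive from `IS_TTY`. A minimal sketch of how such a schema can be resolved in declaration order (the `resolve_defaults` helper below is hypothetical, not ArchiveBox's actual loader):

    import sys
    import platform

    SHELL_CONFIG_SCHEMA = {
        'IS_TTY':        {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR':     {'type': bool, 'default': lambda c: c['IS_TTY']},
        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY'] and platform.system() != 'Darwin'},
    }

    def resolve_defaults(schema):
        # evaluate defaults top-to-bottom so each lambda sees the values resolved before it
        config = {}
        for key, opts in schema.items():
            default = opts['default']
            config[key] = default(config) if callable(default) else default
        return config

    print(resolve_defaults(SHELL_CONFIG_SCHEMA))
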
@@ -82,21 +87,36 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
         'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
         'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
-        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
-        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
+        'URL_DENYLIST': {
+            'type': str,
+            'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$',
+            'aliases': ('URL_BLACKLIST',),
+        },  # to avoid downloading code assets as their own pages
+        'URL_ALLOWLIST': {
+            'type': str,
+            'default': None,
+            'aliases': ('URL_WHITELIST',),
+        },
         'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
         'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
     },
 
     'SERVER_CONFIG': {
         'SECRET_KEY': {'type': str, 'default': None},
-        'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
+        'BIND_ADDR': {
+            'type': str,
+            'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][
+                c['IN_DOCKER']
+            ],
+        },
         'ALLOWED_HOSTS': {'type': str, 'default': '*'},
         'DEBUG': {'type': bool, 'default': False},
         'PUBLIC_INDEX': {'type': bool, 'default': True},
         'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
         'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
-        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
+        'FOOTER_INFO': {
+            'type': str,
+            'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',
+        },
         'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
         'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
         'TIME_ZONE': {'type': str, 'default': 'UTC'},
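
Note: the `'aliases'` tuples on these options (`URL_BLACKLIST` for `URL_DENYLIST`, `URL_WHITELIST` for `URL_ALLOWLIST`) let renamed options keep honoring the old environment-variable names. A sketch of the lookup pattern, with a hypothetical `env_value` helper standing in for the real config loader:

    import os

    def env_value(key, aliases=()):
        # prefer the canonical name, then fall back to legacy aliases
        for name in (key, *aliases):
            if name in os.environ:
                return os.environ[name]
        return None

    os.environ['URL_BLACKLIST'] = r'\.(css|js)$'   # a user still setting the old name
    print(env_value('URL_DENYLIST', aliases=('URL_BLACKLIST',)))   # -> \.(css|js)$
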
@@ -107,44 +127,114 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
         'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
     },
 
     'ARCHIVE_METHOD_TOGGLES': {
-        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
-        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
-        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
-        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
-        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
-        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
-        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
-        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
-        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
-        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
-        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
-        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
-        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
-        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
-        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
-        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
-        'SAVE_DENYLIST': {'type': dict, 'default': {},},
+        'SAVE_TITLE': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_TITLE',),
+        },
+        'SAVE_FAVICON': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_FAVICON',),
+        },
+        'SAVE_WGET': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WGET',),
+        },
+        'SAVE_WGET_REQUISITES': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WGET_REQUISITES',),
+        },
+        'SAVE_SINGLEFILE': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_SINGLEFILE',),
+        },
+        'SAVE_READABILITY': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_READABILITY',),
+        },
+        'SAVE_MERCURY': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_MERCURY',),
+        },
+        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
+        'SAVE_SCREENSHOT': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_SCREENSHOT',),
+        },
+        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
+        'SAVE_HEADERS': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_HEADERS',),
+        },
+        'SAVE_WARC': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WARC',),
+        },
+        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
+        'SAVE_MEDIA': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_MEDIA',),
+        },
+        'SAVE_ARCHIVE_DOT_ORG': {
+            'type': bool,
+            'default': True,
+            'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',),
+        },
+        'SAVE_ALLOWLIST': {
+            'type': dict,
+            'default': {},
+        },
+        'SAVE_DENYLIST': {
+            'type': dict,
+            'default': {},
+        },
     },
 
     'ARCHIVE_METHOD_OPTIONS': {
-        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
-        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
+        'RESOLUTION': {
+            'type': str,
+            'default': '1440,2000',
+            'aliases': ('SCREENSHOT_RESOLUTION',),
+        },
+        'GIT_DOMAINS': {
+            'type': str,
+            'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com',
+        },
         'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
         'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
-
-        'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-        'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
+        'CURL_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}',
+        },
+        'WGET_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}',
+        },
+        'CHROME_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)',
+        },
         'COOKIES_FILE': {'type': str, 'default': None},
         'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
 
         'CHROME_TIMEOUT': {'type': int, 'default': 0},
         'CHROME_HEADLESS': {'type': bool, 'default': True},
-        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
-        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
+        'CHROME_SANDBOX': {
+            'type': bool,
+            'default': lambda c: not c['IN_DOCKER'],
+        },
+        'YOUTUBEDL_ARGS': {
+            'type': list,
+            'default': lambda c: [
             '--write-description',
             '--write-info-json',
             '--write-annotations',
@@ -152,43 +242,41 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
             '--no-call-home',
             '--write-sub',
             '--all-subs',
-            # There are too many of these and youtube
-            # throttles you with HTTP error 429
-            #'--write-auto-subs',
             '--convert-subs=srt',
             '--yes-playlist',
             '--continue',
-            # This flag doesn't exist in youtube-dl
-            # only in yt-dlp
             '--no-abort-on-error',
-            # --ignore-errors must come AFTER
-            # --no-abort-on-error
-            # https://github.com/yt-dlp/yt-dlp/issues/4914
             '--ignore-errors',
             '--geo-bypass',
             '--add-metadata',
-            '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
-        ]},
+            f"--max-filesize={c['MEDIA_MAX_SIZE']}",
+            ],
+        },
-        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
+        'WGET_ARGS': {
+            'type': list,
+            'default': [
+                '--no-verbose',
             '--adjust-extension',
             '--convert-links',
             '--force-directories',
             '--backup-converted',
             '--span-hosts',
             '--no-parent',
-            '-e', 'robots=off',
-        ]},
-        'CURL_ARGS': {'type': list, 'default': ['--silent',
-                '--location',
-                '--compressed'
-        ]},
+            '-e',
+            'robots=off',
+            ],
+        },
+        'CURL_ARGS': {
+            'type': list,
+            'default': ['--silent', '--location', '--compressed'],
+        },
         'GIT_ARGS': {'type': list, 'default': ['--recursive']},
         'SINGLEFILE_ARGS': {'type': list, 'default': None},
-        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
+        'FAVICON_PROVIDER': {
+            'type': str,
+            'default': 'https://www.google.com/s2/favicons?domain={}',
+        },
     },
 
     'SEARCH_BACKEND_CONFIG': {
         'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
         'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
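
Note: besides dropping a few explanatory comments, the substantive no-op in this hunk is the `str.format` → f-string conversion for the yt-dlp size cap; both forms render the identical argument string:

    c = {'MEDIA_MAX_SIZE': '750m'}
    old = '--max-filesize={}'.format(c['MEDIA_MAX_SIZE'])
    new = f"--max-filesize={c['MEDIA_MAX_SIZE']}"
    assert old == new == '--max-filesize=750m'
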
@@ -201,7 +289,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
         'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
     },
 
-
     'DEPENDENCY_CONFIG': {
         'USE_CURL': {'type': bool, 'default': True},
         'USE_WGET': {'type': bool, 'default': True},
@@ -213,19 +300,26 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USE_NODE': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
         'USE_RIPGREP': {'type': bool, 'default': True},
 
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
         'WGET_BINARY': {'type': str, 'default': 'wget'},
-        'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
-        'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
-        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
+        'SINGLEFILE_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('single-file'),
+        },
+        'READABILITY_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('readability-extractor'),
+        },
+        'MERCURY_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('mercury-parser'),
+        },
         #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
         'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
         'NODE_BINARY': {'type': str, 'default': 'node'},
         'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
         'CHROME_BINARY': {'type': str, 'default': None},
 
         'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
         'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
     },
@@ -276,7 +370,7 @@ DEFAULT_CLI_COLORS = {
     'white': '\033[01;37m',
     'black': '\033[01;30m',
 }
-ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
+ANSI = {k: '' for k in DEFAULT_CLI_COLORS}
 
 COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
     '00': [(0, 0, 0), (0, 0, 0)],
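
Note: iterating a dict yields its keys directly, so dropping `.keys()` is purely cosmetic:

    DEFAULT_CLI_COLORS = {'white': '\033[01;37m', 'black': '\033[01;30m'}
    assert {k: '' for k in DEFAULT_CLI_COLORS} == {k: '' for k in DEFAULT_CLI_COLORS.keys()}
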
@@ -519,15 +613,11 @@ def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
         config_file = ConfigParser()
         config_file.optionxform = str
         config_file.read(config_path)
-        # flatten into one namespace
-        config_file_vars = {
+        return {
             key.upper(): val
             for section, options in config_file.items()
             for key, val in options.items()
         }
-        # print('[i] Loaded config file', os.path.abspath(config_path))
-        # print(config_file_vars)
-        return config_file_vars
     return None
 
 
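
Note: the refactor returns the dict comprehension directly instead of binding it to `config_file_vars` first (dropping two commented-out debug prints along the way). The comprehension flattens every INI section into one upper-cased namespace; a self-contained sketch of the same pattern:

    from configparser import ConfigParser

    config_file = ConfigParser()
    config_file.optionxform = str      # preserve key case instead of lowercasing
    config_file.read_string("""
    [SHELL_CONFIG]
    USE_COLOR = True

    [SERVER_CONFIG]
    BIND_ADDR = 127.0.0.1:8000
    """)

    flattened = {
        key.upper(): val
        for section, options in config_file.items()
        for key, val in options.items()
    }
    print(flattened)   # {'USE_COLOR': 'True', 'BIND_ADDR': '127.0.0.1:8000'}
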
@@ -536,6 +626,10 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
     from .system import atomic_write
 
+    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
+    config_path = Path(out_dir) / CONFIG_FILENAME
+
+    if not config_path.exists():
     CONFIG_HEADER = (
     """# This is the config file for your ArchiveBox collection.
     #
@@ -550,10 +644,6 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
     """)
 
-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    config_path = Path(out_dir) / CONFIG_FILENAME
-
-    if not config_path.exists():
         atomic_write(config_path, CONFIG_HEADER)
 
     config_file = ConfigParser()
@@ -568,10 +658,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     # Set up sections in empty config file
     for key, val in config.items():
         section = find_section(key)
-        if section in config_file:
-            existing_config = dict(config_file[section])
-        else:
-            existing_config = {}
+        existing_config = dict(config_file[section]) if section in config_file else {}
         config_file[section] = {**existing_config, key: val}
 
     # always make sure there's a SECRET_KEY defined for Django
@@ -604,10 +691,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     if Path(f'{config_path}.bak').exists():
         os.remove(f'{config_path}.bak')
 
-    return {
-        key.upper(): CONFIG.get(key.upper())
-        for key in config.keys()
-    }
+    return {key.upper(): CONFIG.get(key.upper()) for key in config}
 
 
 
@@ -637,7 +721,7 @@ def load_config(defaults: ConfigDefaultDict,
     except Exception as e:
         stderr()
         stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
-        stderr(' {}: {}'.format(e.__class__.__name__, e))
+        stderr(f' {e.__class__.__name__}: {e}')
        stderr()
        stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
        stderr()
@@ -683,7 +767,7 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Op
     else:
         stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
         for line in text[1:]:
-            stderr('{} {}'.format(prefix, line))
+            stderr(f'{prefix} {line}')
 
 
 # Dependency Metadata Helpers
@@ -754,8 +838,7 @@ def find_chrome_binary() -> Optional[str]:
         'google-chrome-dev',
     )
     for name in default_executable_paths:
-        full_path_exists = shutil.which(name)
-        if full_path_exists:
+        if full_path_exists := shutil.which(name):
             return name
 
     return None
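
Note: `if full_path_exists := shutil.which(name):` uses Python 3.8's assignment expression (the "walrus operator"), which binds and truth-tests in one step; Sourcery applies the same "use named expression" refactor in several functions below. An equivalent standalone example:

    import shutil

    def find_binary(candidates):
        for name in candidates:
            # walrus: assign shutil.which()'s result and test its truthiness at once
            if full_path := shutil.which(name):
                return full_path
        return None

    print(find_binary(['python3', 'python']))
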
@@ -793,7 +876,7 @@ def wget_supports_compression(config):
             "--help",
         ]
         return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
-    except (FileNotFoundError, OSError):
+    except OSError:
         return False
 
 def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
@@ -1104,7 +1187,9 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
         if '/Default' in str(config['CHROME_USER_DATA_DIR']):
             stderr()
             stderr(' Try removing /Default from the end e.g.:')
-            stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
+            stderr(
+                f""" CHROME_USER_DATA_DIR="{config['CHROME_USER_DATA_DIR'].split('/Default')[0]}\""""
+            )
         raise SystemExit(2)
 
 
@@ -1117,11 +1202,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
         for dependency, info in invalid_dependencies:
             stderr(
-                ' ! {}: {} ({})'.format(
-                    dependency,
-                    info['path'] or 'unable to find binary',
-                    info['version'] or 'unable to detect version',
-                )
+                f" ! {dependency}: {info['path'] or 'unable to find binary'} ({info['version'] or 'unable to detect version'})"
             )
             if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
                 hint(('To install all packages automatically run: archivebox setup',
@@ -1178,9 +1259,9 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
     output_dir = out_dir or config['OUTPUT_DIR']
     from .index.sql import list_migrations
 
-    pending_migrations = [name for status, name in list_migrations() if not status]
-
-    if pending_migrations:
+    if pending_migrations := [
+        name for status, name in list_migrations() if not status
+    ]:
         stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
         stderr(f' {output_dir}')
         stderr()
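
Note: the same named-expression refactor works with a comprehension: the filtered list is bound as `pending_migrations` inside the `if` and stays in scope for the error-reporting body. A sketch of the shape:

    def check(migrations):
        # the walrus keeps the filtered list available inside the if-body
        if pending := [name for status, name in migrations if not status]:
            print(f'{len(pending)} pending migration(s): {pending}')

    check([(True, '0001_initial'), (False, '0007_archiveresult')])
    # -> 1 pending migration(s): ['0007_archiveresult']
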
@@ -168,12 +168,15 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         with open(ERROR_LOG, "a", encoding='utf-8') as f:
             command = ' '.join(sys.argv)
             ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-            f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
-                method_name,
-                link.url,
-                command,
-                ts
-            )) + "\n")
+            f.write(
+                (
+                    (
+                        "\n"
+                        + f'Exception in archive_methods.save_{method_name}(Link(url={link.url})) command={command}; ts={ts}'
+                    )
+                    + "\n"
+                )
+            )
         #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
 
         # print('  ', stats)
@@ -197,7 +200,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             raise
 
     except Exception as err:
-        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
+        print(f' ! Failed to archive link: {err.__class__.__name__}: {err}')
         raise
 
     return link
@@ -85,10 +85,9 @@ def merge_links(a: Link, b: Link) -> Link:
     )
 
     # all unique, truthy tags
-    tags_set = (
-        set(tag.strip() for tag in (a.tags or '').split(','))
-        | set(tag.strip() for tag in (b.tags or '').split(','))
-    )
+    tags_set = {tag.strip() for tag in (a.tags or '').split(',')} | {
+        tag.strip() for tag in (b.tags or '').split(',')
+    }
     tags = ','.join(tags_set) or None
 
     # all unique source entries
@@ -199,10 +198,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     if timestamp not in used_timestamps:
         return timestamp
 
-    new_timestamp = '{}.{}'.format(timestamp, nonce)
+    new_timestamp = f'{timestamp}.{nonce}'
     while new_timestamp in used_timestamps:
         nonce += 1
-        new_timestamp = '{}.{}'.format(timestamp, nonce)
+        new_timestamp = f'{timestamp}.{nonce}'
 
     return new_timestamp
 
@@ -292,8 +291,7 @@ def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) ->
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     for link in links:
-        index_link = snapshots.filter(url=link.url)
-        if index_link:
+        if index_link := snapshots.filter(url=link.url):
             link = merge_links(index_link[0].as_link(), link)
 
         unique_urls[link.url] = link
@@ -320,7 +318,7 @@ def dedupe_links(snapshots: QuerySet,
 
     # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in dedup_links_dict.keys():
+        if new_links[i].url in dedup_links_dict:
             new_links[i] = dedup_links_dict[new_links[i].url]
     log_deduping_finished(len(new_links))
 
@@ -345,8 +343,7 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
     """
     out_dir = out_dir or link.link_dir
 
-    existing_link = parse_json_link_details(out_dir)
-    if existing_link:
+    if existing_link := parse_json_link_details(out_dir):
         return merge_links(existing_link, link)
 
     return link
@@ -564,9 +561,8 @@ def is_valid(link: Link) -> bool:
     if not dir_exists:
         # unarchived links are not included in the valid list
         return False
-    if dir_exists and not index_exists:
+    if not index_exists:
         return False
-    if dir_exists and index_exists:
     try:
         parsed_link = parse_json_link_details(link.link_dir, guess=True)
         return link.url == parsed_link.url
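
Note: the `is_valid` change is a guard-clause simplification: after `if not dir_exists: return False`, the later checks only run when `dir_exists` is true, so `if dir_exists and not index_exists:` reduces to `if not index_exists:`, and the final `if dir_exists and index_exists:` wrapper is always true and can be dropped entirely (dedenting the `try` block). A boiled-down sketch of the resulting control flow:

    def is_valid(dir_exists, index_exists, parse_ok):
        # hypothetical stand-in for the refactored function's logic
        if not dir_exists:
            return False
        if not index_exists:   # dir_exists is already known to be True here
            return False
        return parse_ok        # the 'dir_exists and index_exists' guard was redundant

    assert is_valid(True, True, True) and not is_valid(True, False, True)
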
@@ -575,22 +571,13 @@ def is_valid(link: Link) -> bool:
         return False
 
 def is_corrupt(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
-        # unarchived links are not considered corrupt
-        return False
-
-    if is_valid(link):
-        return False
-
-    return True
+    return False if not Path(link.link_dir).exists() else not is_valid(link)
 
 def is_archived(link: Link) -> bool:
     return is_valid(link) and link.is_archived
 
 def is_unarchived(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
-        return True
-    return not link.is_archived
+    return True if not Path(link.link_dir).exists() else not link.is_archived
 
 
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
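
Note: `is_corrupt` and `is_unarchived` collapse their early-return pairs into single conditional expressions. The rewrite is behavior-preserving, though it trades the explanatory comment ("unarchived links are not considered corrupt") for brevity; the two forms compare like this (a sketch with a stand-in `link` object and a stubbed `is_valid`):

    from pathlib import Path
    from types import SimpleNamespace

    def is_valid(link):
        return link.url == 'https://example.com'   # stub for the real index check

    def is_corrupt_verbose(link):
        if not Path(link.link_dir).exists():
            return False   # unarchived links are not considered corrupt
        return not is_valid(link)

    def is_corrupt_terse(link):
        return False if not Path(link.link_dir).exists() else not is_valid(link)

    link = SimpleNamespace(link_dir='/tmp', url='https://example.com')
    assert is_corrupt_verbose(link) == is_corrupt_terse(link)
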