'Refactored by Sourcery'

Author: Sourcery AI
Date:   2023-08-02 14:57:45 +00:00
parent 2076474252
commit 771d5ad676
3 changed files with 293 additions and 222 deletions

View File

@@ -19,6 +19,7 @@ Documentation:
""" """
__package__ = 'archivebox' __package__ = 'archivebox'
import os import os
@@ -67,13 +68,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SHELL_CONFIG': {
         'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
         'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
-        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now
+        'SHOW_PROGRESS': {
+            'type': bool,
+            'default': lambda c: (
+                c['IS_TTY'] and platform.system() != 'Darwin'
+            ),
+        }, # progress bars are buggy on mac, disable for now
         'IN_DOCKER': {'type': bool, 'default': False},
         'PUID': {'type': int, 'default': os.getuid()},
         'PGID': {'type': int, 'default': os.getgid()},
         # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
     },
     'GENERAL_CONFIG': {
         'OUTPUT_DIR': {'type': str, 'default': None},
         'CONFIG_FILE': {'type': str, 'default': None},
@@ -82,21 +87,36 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
         'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
         'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
-        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)}, # to avoid downloading code assets as their own pages
-        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
+        'URL_DENYLIST': {
+            'type': str,
+            'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$',
+            'aliases': ('URL_BLACKLIST',),
+        }, # to avoid downloading code assets as their own pages
+        'URL_ALLOWLIST': {
+            'type': str,
+            'default': None,
+            'aliases': ('URL_WHITELIST',),
+        },
         'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
         'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
     },
     'SERVER_CONFIG': {
         'SECRET_KEY': {'type': str, 'default': None},
-        'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
+        'BIND_ADDR': {
+            'type': str,
+            'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][
+                c['IN_DOCKER']
+            ],
+        },
         'ALLOWED_HOSTS': {'type': str, 'default': '*'},
         'DEBUG': {'type': bool, 'default': False},
         'PUBLIC_INDEX': {'type': bool, 'default': True},
         'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
         'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
-        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
+        'FOOTER_INFO': {
+            'type': str,
+            'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',
+        },
         'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
         'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
         'TIME_ZONE': {'type': str, 'default': 'UTC'},
@@ -107,44 +127,114 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
         'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
     },
     'ARCHIVE_METHOD_TOGGLES': {
-        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
-        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
-        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
-        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
-        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
-        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
-        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
-        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
-        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
-        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
-        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
-        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
-        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
-        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
-        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
-        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
-        'SAVE_DENYLIST': {'type': dict, 'default': {},},
+        'SAVE_TITLE': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_TITLE',),
+        },
+        'SAVE_FAVICON': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_FAVICON',),
+        },
+        'SAVE_WGET': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WGET',),
+        },
+        'SAVE_WGET_REQUISITES': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WGET_REQUISITES',),
+        },
+        'SAVE_SINGLEFILE': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_SINGLEFILE',),
+        },
+        'SAVE_READABILITY': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_READABILITY',),
+        },
+        'SAVE_MERCURY': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_MERCURY',),
+        },
+        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
+        'SAVE_SCREENSHOT': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_SCREENSHOT',),
+        },
+        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
+        'SAVE_HEADERS': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_HEADERS',),
+        },
+        'SAVE_WARC': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WARC',),
+        },
+        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
+        'SAVE_MEDIA': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_MEDIA',),
+        },
+        'SAVE_ARCHIVE_DOT_ORG': {
+            'type': bool,
+            'default': True,
+            'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',),
+        },
+        'SAVE_ALLOWLIST': {
+            'type': dict,
+            'default': {},
+        },
+        'SAVE_DENYLIST': {
+            'type': dict,
+            'default': {},
+        },
     },
     'ARCHIVE_METHOD_OPTIONS': {
-        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
-        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
+        'RESOLUTION': {
+            'type': str,
+            'default': '1440,2000',
+            'aliases': ('SCREENSHOT_RESOLUTION',),
+        },
+        'GIT_DOMAINS': {
+            'type': str,
+            'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com',
+        },
         'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
         'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
-        'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-        'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
+        'CURL_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}',
+        },
+        'WGET_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}',
+        },
+        'CHROME_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)',
+        },
         'COOKIES_FILE': {'type': str, 'default': None},
         'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
         'CHROME_TIMEOUT': {'type': int, 'default': 0},
         'CHROME_HEADLESS': {'type': bool, 'default': True},
-        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
-        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
+        'CHROME_SANDBOX': {
+            'type': bool,
+            'default': lambda c: not c['IN_DOCKER'],
+        },
+        'YOUTUBEDL_ARGS': {
+            'type': list,
+            'default': lambda c: [
             '--write-description',
             '--write-info-json',
             '--write-annotations',
@@ -152,43 +242,41 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
             '--no-call-home',
             '--write-sub',
             '--all-subs',
             # There are too many of these and youtube
             # throttles you with HTTP error 429
             #'--write-auto-subs',
             '--convert-subs=srt',
             '--yes-playlist',
             '--continue',
             # This flag doesn't exist in youtube-dl
             # only in yt-dlp
             '--no-abort-on-error',
             # --ignore-errors must come AFTER
             # --no-abort-on-error
             # https://github.com/yt-dlp/yt-dlp/issues/4914
             '--ignore-errors',
             '--geo-bypass',
             '--add-metadata',
-            '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
-        ]},
-        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
+            f"--max-filesize={c['MEDIA_MAX_SIZE']}",
+            ],
+        },
+        'WGET_ARGS': {
+            'type': list,
+            'default': [
+                '--no-verbose',
             '--adjust-extension',
             '--convert-links',
             '--force-directories',
             '--backup-converted',
             '--span-hosts',
             '--no-parent',
-            '-e', 'robots=off',
-        ]},
-        'CURL_ARGS': {'type': list, 'default': ['--silent',
-            '--location',
-            '--compressed'
-        ]},
+                '-e',
+                'robots=off',
+            ],
+        },
+        'CURL_ARGS': {
+            'type': list,
+            'default': ['--silent', '--location', '--compressed'],
+        },
         'GIT_ARGS': {'type': list, 'default': ['--recursive']},
         'SINGLEFILE_ARGS': {'type': list, 'default': None},
-        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
+        'FAVICON_PROVIDER': {
+            'type': str,
+            'default': 'https://www.google.com/s2/favicons?domain={}',
+        },
     },
     'SEARCH_BACKEND_CONFIG': {
         'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
         'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
@@ -201,7 +289,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
         'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
     },
     'DEPENDENCY_CONFIG': {
         'USE_CURL': {'type': bool, 'default': True},
         'USE_WGET': {'type': bool, 'default': True},
@@ -213,19 +300,26 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USE_NODE': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
         'USE_RIPGREP': {'type': bool, 'default': True},
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
         'WGET_BINARY': {'type': str, 'default': 'wget'},
-        'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
-        'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
-        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
+        'SINGLEFILE_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('single-file'),
+        },
+        'READABILITY_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('readability-extractor'),
+        },
+        'MERCURY_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('mercury-parser'),
+        },
         #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
         'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
         'NODE_BINARY': {'type': str, 'default': 'node'},
         'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
         'CHROME_BINARY': {'type': str, 'default': None},
         'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
         'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
     },
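Note: many of the defaults above are lambdas that receive the partially-resolved config dict (for example 'USE_COLOR' reads c['IS_TTY'] and 'CHROME_SANDBOX' reads c['IN_DOCKER']), so later keys can be derived from earlier ones. A minimal standalone sketch of how such callable defaults can be resolved in declaration order follows; the resolve_defaults helper is hypothetical and not ArchiveBox's actual load_config.

# Hypothetical sketch only: resolve schema defaults in declaration order,
# calling any callable default with the values resolved so far.
SCHEMA = {
    'IN_DOCKER': {'type': bool, 'default': False},
    'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
}

def resolve_defaults(schema: dict) -> dict:
    config: dict = {}
    for key, spec in schema.items():
        default = spec['default']
        config[key] = default(config) if callable(default) else default
    return config

print(resolve_defaults(SCHEMA))  # {'IN_DOCKER': False, 'CHROME_SANDBOX': True}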
@@ -276,7 +370,7 @@ DEFAULT_CLI_COLORS = {
 'white': '\033[01;37m',
 'black': '\033[01;30m',
 }
-ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
+ANSI = {k: '' for k in DEFAULT_CLI_COLORS}
 COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
     '00': [(0, 0, 0), (0, 0, 0)],
@@ -519,15 +613,11 @@ def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
         config_file = ConfigParser()
         config_file.optionxform = str
         config_file.read(config_path)
-        # flatten into one namespace
-        config_file_vars = {
+        return {
             key.upper(): val
             for section, options in config_file.items()
             for key, val in options.items()
         }
-        # print('[i] Loaded config file', os.path.abspath(config_path))
-        # print(config_file_vars)
-        return config_file_vars
     return None
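The hunk above replaces the intermediate config_file_vars binding with a direct return of the flattened dict comprehension. For reference, here is the same flattening pattern as a self-contained snippet using only the stdlib configparser; the section and key names below are invented for the example.

from configparser import ConfigParser

parser = ConfigParser()
parser.optionxform = str  # preserve key casing, as in the diff above
parser.read_string("""
[SERVER_CONFIG]
BIND_ADDR = 127.0.0.1:8000

[GENERAL_CONFIG]
TIMEOUT = 60
""")

# flatten every section into one uppercase-keyed namespace
flat = {
    key.upper(): val
    for section, options in parser.items()
    for key, val in options.items()
}
print(flat)  # {'BIND_ADDR': '127.0.0.1:8000', 'TIMEOUT': '60'}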
@@ -536,6 +626,10 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     from .system import atomic_write
+    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
+    config_path = Path(out_dir) / CONFIG_FILENAME
+    if not config_path.exists():
     CONFIG_HEADER = (
     """# This is the config file for your ArchiveBox collection.
     #
@@ -550,10 +644,6 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
""") """)
out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
config_path = Path(out_dir) / CONFIG_FILENAME
if not config_path.exists():
atomic_write(config_path, CONFIG_HEADER) atomic_write(config_path, CONFIG_HEADER)
config_file = ConfigParser() config_file = ConfigParser()
@@ -568,10 +658,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     # Set up sections in empty config file
     for key, val in config.items():
         section = find_section(key)
-        if section in config_file:
-            existing_config = dict(config_file[section])
-        else:
-            existing_config = {}
+        existing_config = dict(config_file[section]) if section in config_file else {}
         config_file[section] = {**existing_config, key: val}
     # always make sure there's a SECRET_KEY defined for Django
@@ -604,10 +691,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     if Path(f'{config_path}.bak').exists():
         os.remove(f'{config_path}.bak')
-    return {
-        key.upper(): CONFIG.get(key.upper())
-        for key in config.keys()
-    }
+    return {key.upper(): CONFIG.get(key.upper()) for key in config}
@@ -637,7 +721,7 @@ def load_config(defaults: ConfigDefaultDict,
         except Exception as e:
             stderr()
             stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
-            stderr(' {}: {}'.format(e.__class__.__name__, e))
+            stderr(f' {e.__class__.__name__}: {e}')
             stderr()
             stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
             stderr()
@@ -683,7 +767,7 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Op
     else:
         stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
         for line in text[1:]:
-            stderr('{} {}'.format(prefix, line))
+            stderr(f'{prefix} {line}')
 # Dependency Metadata Helpers
@@ -754,8 +838,7 @@ def find_chrome_binary() -> Optional[str]:
         'google-chrome-dev',
     )
     for name in default_executable_paths:
-        full_path_exists = shutil.which(name)
-        if full_path_exists:
+        if full_path_exists := shutil.which(name):
             return name
     return None
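The find_chrome_binary change above is the assignment-expression (walrus) pattern that Sourcery also applies in several hunks further down: a separate assign-then-test pair becomes a single if. A generic, self-contained illustration of the same transformation follows; the function and variable names are illustrative, not taken from ArchiveBox.

import shutil
from typing import Optional

def find_first_on_path(candidates: tuple) -> Optional[str]:
    for name in candidates:
        # assign and test in one expression (Python 3.8+)
        if full_path := shutil.which(name):
            return full_path
    return None

print(find_first_on_path(('chromium', 'google-chrome', 'chrome')))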
@@ -793,7 +876,7 @@ def wget_supports_compression(config):
"--help", "--help",
] ]
return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
except (FileNotFoundError, OSError): except OSError:
return False return False
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
@@ -1104,7 +1187,9 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
     if '/Default' in str(config['CHROME_USER_DATA_DIR']):
         stderr()
         stderr(' Try removing /Default from the end e.g.:')
-        stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
+        stderr(
+            f""" CHROME_USER_DATA_DIR="{config['CHROME_USER_DATA_DIR'].split('/Default')[0]}\""""
+        )
     raise SystemExit(2)
@@ -1117,11 +1202,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
         for dependency, info in invalid_dependencies:
             stderr(
-                ' ! {}: {} ({})'.format(
-                    dependency,
-                    info['path'] or 'unable to find binary',
-                    info['version'] or 'unable to detect version',
-                )
+                f" ! {dependency}: {info['path'] or 'unable to find binary'} ({info['version'] or 'unable to detect version'})"
             )
             if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
                 hint(('To install all packages automatically run: archivebox setup',
@@ -1178,9 +1259,9 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
     output_dir = out_dir or config['OUTPUT_DIR']
     from .index.sql import list_migrations
-    pending_migrations = [name for status, name in list_migrations() if not status]
-    if pending_migrations:
+    if pending_migrations := [
+        name for status, name in list_migrations() if not status
+    ]:
         stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
         stderr(f' {output_dir}')
         stderr()

View File

@@ -168,12 +168,15 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         with open(ERROR_LOG, "a", encoding='utf-8') as f:
             command = ' '.join(sys.argv)
             ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-            f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
-                method_name,
-                link.url,
-                command,
-                ts
-            ) + "\n"))
+            f.write(
+                (
+                    (
+                        "\n"
+                        + f'Exception in archive_methods.save_{method_name}(Link(url={link.url})) command={command}; ts={ts}'
+                    )
+                    + "\n"
+                )
+            )
         #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
         # print(' ', stats)
@@ -197,7 +200,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             raise
         except Exception as err:
-            print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
+            print(f' ! Failed to archive link: {err.__class__.__name__}: {err}')
             raise
     return link

View File

@@ -85,10 +85,9 @@ def merge_links(a: Link, b: Link) -> Link:
     )
     # all unique, truthy tags
-    tags_set = (
-        set(tag.strip() for tag in (a.tags or '').split(','))
-        | set(tag.strip() for tag in (b.tags or '').split(','))
-    )
+    tags_set = {tag.strip() for tag in (a.tags or '').split(',')} | {
+        tag.strip() for tag in (b.tags or '').split(',')
+    }
     tags = ','.join(tags_set) or None
     # all unique source entries
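The tags merge above now builds two set comprehensions and joins them with |. A tiny standalone check of what that union produces, with sample tag strings invented for the example:

a_tags = 'news, tech'
b_tags = 'tech,archive'

tags_set = {tag.strip() for tag in (a_tags or '').split(',')} | {
    tag.strip() for tag in (b_tags or '').split(',')
}
print(sorted(tags_set))  # ['archive', 'news', 'tech']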
@@ -199,10 +198,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     if timestamp not in used_timestamps:
         return timestamp
-    new_timestamp = '{}.{}'.format(timestamp, nonce)
+    new_timestamp = f'{timestamp}.{nonce}'
     while new_timestamp in used_timestamps:
         nonce += 1
-        new_timestamp = '{}.{}'.format(timestamp, nonce)
+        new_timestamp = f'{timestamp}.{nonce}'
     return new_timestamp
@@ -292,8 +291,7 @@ def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) ->
     unique_urls: OrderedDict[str, Link] = OrderedDict()
     for link in links:
-        index_link = snapshots.filter(url=link.url)
-        if index_link:
+        if index_link := snapshots.filter(url=link.url):
             link = merge_links(index_link[0].as_link(), link)
         unique_urls[link.url] = link
@@ -320,7 +318,7 @@ def dedupe_links(snapshots: QuerySet,
     # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in dedup_links_dict.keys():
+        if new_links[i].url in dedup_links_dict:
             new_links[i] = dedup_links_dict[new_links[i].url]
     log_deduping_finished(len(new_links))
@@ -345,8 +343,7 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
""" """
out_dir = out_dir or link.link_dir out_dir = out_dir or link.link_dir
existing_link = parse_json_link_details(out_dir) if existing_link := parse_json_link_details(out_dir):
if existing_link:
return merge_links(existing_link, link) return merge_links(existing_link, link)
return link return link
@@ -564,9 +561,8 @@ def is_valid(link: Link) -> bool:
     if not dir_exists:
         # unarchived links are not included in the valid list
         return False
-    if dir_exists and not index_exists:
+    if not index_exists:
         return False
-    if dir_exists and index_exists:
     try:
         parsed_link = parse_json_link_details(link.link_dir, guess=True)
         return link.url == parsed_link.url
@@ -575,22 +571,13 @@ def is_valid(link: Link) -> bool:
         return False
 def is_corrupt(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
-        # unarchived links are not considered corrupt
-        return False
-    if is_valid(link):
-        return False
-    return True
+    return False if not Path(link.link_dir).exists() else not is_valid(link)
 def is_archived(link: Link) -> bool:
     return is_valid(link) and link.is_archived
 def is_unarchived(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
-        return True
-    return not link.is_archived
+    return True if not Path(link.link_dir).exists() else not link.is_archived
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
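The is_corrupt and is_unarchived rewrites above collapse guard-clause chains into single conditional expressions. A behavior-equivalent sketch of that pattern on a made-up predicate; has_index and is_unindexed are hypothetical helpers, not ArchiveBox functions.

from pathlib import Path

def has_index(link_dir: str) -> bool:
    # made-up helper for the example
    return (Path(link_dir) / 'index.json').exists()

# before: guard clauses spread across several branches
def is_unindexed_verbose(link_dir: str) -> bool:
    if not Path(link_dir).exists():
        # missing folders don't count as unindexed
        return False
    if has_index(link_dir):
        return False
    return True

# after: the same logic as one conditional expression
def is_unindexed(link_dir: str) -> bool:
    return False if not Path(link_dir).exists() else not has_index(link_dir)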