From 771d5ad676f98b75de3c05efb4860f64b96876dd Mon Sep 17 00:00:00 2001
From: Sourcery AI <>
Date: Wed, 2 Aug 2023 14:57:45 +0000
Subject: [PATCH] 'Refactored by Sourcery'

---
 archivebox/config.py              | 449 ++++++++++++++++++------------
 archivebox/extractors/__init__.py |  21 +-
 archivebox/index/__init__.py      |  45 ++-
 3 files changed, 293 insertions(+), 222 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 7334b169..87ae38c4 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -19,6 +19,7 @@ Documentation:
 """
 
+
 __package__ = 'archivebox'
 
 import os
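Note: the large CONFIG_SCHEMA hunk below is a pure re-layout — one-line entries are expanded to one key per line, and no defaults, types, or aliases change. Each entry's 'default' is either a literal or a callable that receives the config resolved so far, which is why USE_COLOR can read IS_TTY. A rough sketch of that resolution order, using a hypothetical resolve_defaults() helper (the real logic lives in load_config() in archivebox/config.py):

    import sys

    SCHEMA = {
        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
    }

    def resolve_defaults(schema):
        config = {}
        for key, entry in schema.items():
            default = entry['default']
            # callables see only the keys resolved before them, so order
            # matters: IS_TTY is computed first, then USE_COLOR reads it
            config[key] = default(config) if callable(default) else default
        return config

    print(resolve_defaults(SCHEMA))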
@@ -65,169 +66,262 @@ except ModuleNotFoundError:
 
 CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SHELL_CONFIG': {
-        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
-        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
-        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
-        'IN_DOCKER': {'type': bool, 'default': False},
-        'PUID': {'type': int, 'default': os.getuid()},
-        'PGID': {'type': int, 'default': os.getgid()},
+        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
+        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
+        'SHOW_PROGRESS': {
+            'type': bool,
+            'default': lambda c: (
+                c['IS_TTY'] and platform.system() != 'Darwin'
+            ),
+        },  # progress bars are buggy on mac, disable for now
+        'IN_DOCKER': {'type': bool, 'default': False},
+        'PUID': {'type': int, 'default': os.getuid()},
+        'PGID': {'type': int, 'default': os.getgid()},
         # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
     },
-
     'GENERAL_CONFIG': {
-        'OUTPUT_DIR': {'type': str, 'default': None},
-        'CONFIG_FILE': {'type': str, 'default': None},
-        'ONLY_NEW': {'type': bool, 'default': True},
-        'TIMEOUT': {'type': int, 'default': 60},
-        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
-        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
-        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
-        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
-        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
-        'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
-        'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
+        'OUTPUT_DIR': {'type': str, 'default': None},
+        'CONFIG_FILE': {'type': str, 'default': None},
+        'ONLY_NEW': {'type': bool, 'default': True},
+        'TIMEOUT': {'type': int, 'default': 60},
+        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
+        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
+        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
+        'URL_DENYLIST': {
+            'type': str,
+            'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$',
+            'aliases': ('URL_BLACKLIST',),
+        },  # to avoid downloading code assets as their own pages
+        'URL_ALLOWLIST': {
+            'type': str,
+            'default': None,
+            'aliases': ('URL_WHITELIST',),
+        },
+        'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
+        'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
     },
-
     'SERVER_CONFIG': {
-        'SECRET_KEY': {'type': str, 'default': None},
-        'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
-        'ALLOWED_HOSTS': {'type': str, 'default': '*'},
-        'DEBUG': {'type': bool, 'default': False},
-        'PUBLIC_INDEX': {'type': bool, 'default': True},
-        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
-        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
-        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
-        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
-        'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
-        'TIME_ZONE': {'type': str, 'default': 'UTC'},
-        'TIMEZONE': {'type': str, 'default': 'UTC'},
-        'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
-        'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
-        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
-        'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
-        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
+        'SECRET_KEY': {'type': str, 'default': None},
+        'BIND_ADDR': {
+            'type': str,
+            'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][
+                c['IN_DOCKER']
+            ],
+        },
+        'ALLOWED_HOSTS': {'type': str, 'default': '*'},
+        'DEBUG': {'type': bool, 'default': False},
+        'PUBLIC_INDEX': {'type': bool, 'default': True},
+        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
+        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
+        'FOOTER_INFO': {
+            'type': str,
+            'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',
+        },
+        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
+        'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
+        'TIME_ZONE': {'type': str, 'default': 'UTC'},
+        'TIMEZONE': {'type': str, 'default': 'UTC'},
+        'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
+        'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
+        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
+        'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
+        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
     },
-
     'ARCHIVE_METHOD_TOGGLES': {
-        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
-        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
-        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
-        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
-        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
-        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
-        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
-        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
-        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
-        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
-        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
-        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
-        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
-        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
-        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
-        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
-        'SAVE_DENYLIST': {'type': dict, 'default': {},},
+        'SAVE_TITLE': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_TITLE',),
+        },
+        'SAVE_FAVICON': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_FAVICON',),
+        },
+        'SAVE_WGET': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WGET',),
+        },
+        'SAVE_WGET_REQUISITES': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WGET_REQUISITES',),
+        },
+        'SAVE_SINGLEFILE': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_SINGLEFILE',),
+        },
+        'SAVE_READABILITY': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_READABILITY',),
+        },
+        'SAVE_MERCURY': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_MERCURY',),
+        },
+        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
+        'SAVE_SCREENSHOT': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_SCREENSHOT',),
+        },
+        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
+        'SAVE_HEADERS': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_HEADERS',),
+        },
+        'SAVE_WARC': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WARC',),
+        },
+        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
+        'SAVE_MEDIA': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_MEDIA',),
+        },
+        'SAVE_ARCHIVE_DOT_ORG': {
+            'type': bool,
+            'default': True,
+            'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',),
+        },
+        'SAVE_ALLOWLIST': {
+            'type': dict,
+            'default': {},
+        },
+        'SAVE_DENYLIST': {
+            'type': dict,
+            'default': {},
+        },
     },
-
     'ARCHIVE_METHOD_OPTIONS': {
-        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
-        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
-        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
-        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
-
-        'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-        'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
-
-        'COOKIES_FILE': {'type': str, 'default': None},
-        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
-
-        'CHROME_TIMEOUT': {'type': int, 'default': 0},
-        'CHROME_HEADLESS': {'type': bool, 'default': True},
-        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
-        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
-            '--write-description',
-            '--write-info-json',
-            '--write-annotations',
-            '--write-thumbnail',
-            '--no-call-home',
-            '--write-sub',
-            '--all-subs',
-            # There are too many of these and youtube
-            # throttles you with HTTP error 429
-            #'--write-auto-subs',
-            '--convert-subs=srt',
-            '--yes-playlist',
-            '--continue',
-            # This flag doesn't exist in youtube-dl
-            # only in yt-dlp
-            '--no-abort-on-error',
-            # --ignore-errors must come AFTER
-            # --no-abort-on-error
-            # https://github.com/yt-dlp/yt-dlp/issues/4914
-            '--ignore-errors',
-            '--geo-bypass',
-            '--add-metadata',
-            '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
-        ]},
-
-
-        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
-                      '--adjust-extension',
-                      '--convert-links',
-                      '--force-directories',
-                      '--backup-converted',
-                      '--span-hosts',
-                      '--no-parent',
-                      '-e', 'robots=off',
-                      ]},
-        'CURL_ARGS': {'type': list, 'default': ['--silent',
-                      '--location',
-                      '--compressed'
-                      ]},
-        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
-        'SINGLEFILE_ARGS': {'type': list, 'default' : None},
-        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
+        'RESOLUTION': {
+            'type': str,
+            'default': '1440,2000',
+            'aliases': ('SCREENSHOT_RESOLUTION',),
+        },
+        'GIT_DOMAINS': {
+            'type': str,
+            'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com',
+        },
+        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
+        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
+        'CURL_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}',
+        },
+        'WGET_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}',
+        },
+        'CHROME_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)',
+        },
+        'COOKIES_FILE': {'type': str, 'default': None},
+        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
+        'CHROME_TIMEOUT': {'type': int, 'default': 0},
+        'CHROME_HEADLESS': {'type': bool, 'default': True},
+        'CHROME_SANDBOX': {
+            'type': bool,
+            'default': lambda c: not c['IN_DOCKER'],
+        },
+        'YOUTUBEDL_ARGS': {
+            'type': list,
+            'default': lambda c: [
+                '--write-description',
+                '--write-info-json',
+                '--write-annotations',
+                '--write-thumbnail',
+                '--no-call-home',
+                '--write-sub',
+                '--all-subs',
+                '--convert-subs=srt',
+                '--yes-playlist',
+                '--continue',
+                '--no-abort-on-error',
+                '--ignore-errors',
+                '--geo-bypass',
+                '--add-metadata',
+                f"--max-filesize={c['MEDIA_MAX_SIZE']}",
+            ],
+        },
+        'WGET_ARGS': {
+            'type': list,
+            'default': [
+                '--no-verbose',
+                '--adjust-extension',
+                '--convert-links',
+                '--force-directories',
+                '--backup-converted',
+                '--span-hosts',
+                '--no-parent',
+                '-e',
+                'robots=off',
+            ],
+        },
+        'CURL_ARGS': {
+            'type': list,
+            'default': ['--silent', '--location', '--compressed'],
+        },
+        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
+        'SINGLEFILE_ARGS': {'type': list, 'default': None},
+        'FAVICON_PROVIDER': {
+            'type': str,
+            'default': 'https://www.google.com/s2/favicons?domain={}',
+        },
     },
-
-    'SEARCH_BACKEND_CONFIG' : {
-        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
-        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
-        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
-        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
-        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
-        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
+    'SEARCH_BACKEND_CONFIG': {
+        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
+        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
+        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
+        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
+        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
+        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
         # SONIC
-        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
-        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
-        'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
+        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
+        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
+        'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
     },
-
     'DEPENDENCY_CONFIG': {
-        'USE_CURL': {'type': bool, 'default': True},
-        'USE_WGET': {'type': bool, 'default': True},
-        'USE_SINGLEFILE': {'type': bool, 'default': True},
-        'USE_READABILITY': {'type': bool, 'default': True},
-        'USE_MERCURY': {'type': bool, 'default': True},
-        'USE_GIT': {'type': bool, 'default': True},
-        'USE_CHROME': {'type': bool, 'default': True},
-        'USE_NODE': {'type': bool, 'default': True},
-        'USE_YOUTUBEDL': {'type': bool, 'default': True},
-        'USE_RIPGREP': {'type': bool, 'default': True},
-
-        'CURL_BINARY': {'type': str, 'default': 'curl'},
-        'GIT_BINARY': {'type': str, 'default': 'git'},
-        'WGET_BINARY': {'type': str, 'default': 'wget'},
-        'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
-        'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
-        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
+        'USE_CURL': {'type': bool, 'default': True},
+        'USE_WGET': {'type': bool, 'default': True},
+        'USE_SINGLEFILE': {'type': bool, 'default': True},
+        'USE_READABILITY': {'type': bool, 'default': True},
+        'USE_MERCURY': {'type': bool, 'default': True},
+        'USE_GIT': {'type': bool, 'default': True},
+        'USE_CHROME': {'type': bool, 'default': True},
+        'USE_NODE': {'type': bool, 'default': True},
+        'USE_YOUTUBEDL': {'type': bool, 'default': True},
+        'USE_RIPGREP': {'type': bool, 'default': True},
+        'CURL_BINARY': {'type': str, 'default': 'curl'},
+        'GIT_BINARY': {'type': str, 'default': 'git'},
+        'WGET_BINARY': {'type': str, 'default': 'wget'},
+        'SINGLEFILE_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('single-file'),
+        },
+        'READABILITY_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('readability-extractor'),
+        },
+        'MERCURY_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('mercury-parser'),
+        },
         #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
-        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
-        'NODE_BINARY': {'type': str, 'default': 'node'},
-        'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
-        'CHROME_BINARY': {'type': str, 'default': None},
-
-        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
-        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
+        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
+        'NODE_BINARY': {'type': str, 'default': 'node'},
+        'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
+        'CHROME_BINARY': {'type': str, 'default': None},
+        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
+        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
     },
 }
 
@@ -276,7 +370,7 @@ DEFAULT_CLI_COLORS = {
     'white': '\033[01;37m',
     'black': '\033[01;30m',
 }
-ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
+ANSI = {k: '' for k in DEFAULT_CLI_COLORS}
 
 COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
     '00': [(0, 0, 0), (0, 0, 0)],
@@ -519,15 +613,11 @@ def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
         config_file = ConfigParser()
         config_file.optionxform = str
         config_file.read(config_path)
-        # flatten into one namespace
-        config_file_vars = {
+        return {
             key.upper(): val
             for section, options in config_file.items()
-                for key, val in options.items()
+            for key, val in options.items()
         }
-        # print('[i] Loaded config file', os.path.abspath(config_path))
-        # print(config_file_vars)
-        return config_file_vars
 
     return None
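Note: the load_config_file() hunk above folds the intermediate config_file_vars dict into a direct return. The two for-clauses in the comprehension nest left-to-right — sections first, then each section's options — flattening the whole INI file into one uppercase-keyed namespace. A self-contained sketch with a throwaway INI string (hypothetical values, not a real ArchiveBox config):

    from configparser import ConfigParser

    config_file = ConfigParser()
    config_file.optionxform = str  # preserve key case, as config.py does
    config_file.read_string('[SERVER_CONFIG]\nbind_addr = 127.0.0.1:8000\n')

    # equivalent to two nested for-loops feeding one dict
    flat = {
        key.upper(): val
        for section, options in config_file.items()
        for key, val in options.items()
    }
    print(flat)  # {'BIND_ADDR': '127.0.0.1:8000'}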
@@ -536,8 +626,12 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
     from .system import atomic_write
 
-    CONFIG_HEADER = (
-    """# This is the config file for your ArchiveBox collection.
+    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
+    config_path = Path(out_dir) / CONFIG_FILENAME
+
+    if not config_path.exists():
+        CONFIG_HEADER = (
+            """# This is the config file for your ArchiveBox collection.
 #
 # You can add options here manually in INI format, or automatically by running:
 #    archivebox config --set KEY=VALUE
 #
 # If you modify this file manually, make sure to update your archive after by running:
 #    archivebox init
 #
 # A list of all possible config with documentation and examples can be found here:
 #    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
 
@@ -550,10 +644,6 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
 """)
 
-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    config_path = Path(out_dir) / CONFIG_FILENAME
-
-    if not config_path.exists():
         atomic_write(config_path, CONFIG_HEADER)
 
     config_file = ConfigParser()
@@ -568,10 +658,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     # Set up sections in empty config file
     for key, val in config.items():
         section = find_section(key)
-        if section in config_file:
-            existing_config = dict(config_file[section])
-        else:
-            existing_config = {}
+        existing_config = dict(config_file[section]) if section in config_file else {}
         config_file[section] = {**existing_config, key: val}
 
     # always make sure there's a SECRET_KEY defined for Django
@@ -604,10 +691,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     if Path(f'{config_path}.bak').exists():
         os.remove(f'{config_path}.bak')
 
-    return {
-        key.upper(): CONFIG.get(key.upper())
-        for key in config.keys()
-    }
+    return {key.upper(): CONFIG.get(key.upper()) for key in config}
 
 
 
@@ -637,7 +721,7 @@ def load_config(defaults: ConfigDefaultDict,
         except Exception as e:
             stderr()
             stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
-            stderr('    {}: {}'.format(e.__class__.__name__, e))
+            stderr(f'    {e.__class__.__name__}: {e}')
             stderr()
             stderr('    Check your config for mistakes and try again (your archive data is unaffected).')
             stderr()
@@ -683,7 +767,7 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Op
     else:
         stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
         for line in text[1:]:
-            stderr('{} {}'.format(prefix, line))
+            stderr(f'{prefix} {line}')
 
 
 # Dependency Metadata Helpers
@@ -754,8 +838,7 @@ def find_chrome_binary() -> Optional[str]:
         'google-chrome-dev',
     )
     for name in default_executable_paths:
-        full_path_exists = shutil.which(name)
-        if full_path_exists:
+        if full_path_exists := shutil.which(name):
            return name
 
     return None
@@ -793,7 +876,7 @@ def wget_supports_compression(config):
             "--help",
         ]
         return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
-    except (FileNotFoundError, OSError):
+    except OSError:
         return False
 
 def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
@@ -1104,7 +1187,9 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
         if '/Default' in str(config['CHROME_USER_DATA_DIR']):
             stderr()
             stderr('    Try removing /Default from the end e.g.:')
-            stderr('        CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
+            stderr(
+                f"""        CHROME_USER_DATA_DIR="{config['CHROME_USER_DATA_DIR'].split('/Default')[0]}\""""
+            )
         raise SystemExit(2)
@@ -1117,11 +1202,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
         for dependency, info in invalid_dependencies:
             stderr(
-                '    ! {}: {} ({})'.format(
-                    dependency,
-                    info['path'] or 'unable to find binary',
-                    info['version'] or 'unable to detect version',
-                )
+                f"    ! {dependency}: {info['path'] or 'unable to find binary'} ({info['version'] or 'unable to detect version'})"
             )
             if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
                 hint(('To install all packages automatically run: archivebox setup',
@@ -1178,9 +1259,9 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
     output_dir = out_dir or config['OUTPUT_DIR']
     from .index.sql import list_migrations
 
-    pending_migrations = [name for status, name in list_migrations() if not status]
-
-    if pending_migrations:
+    if pending_migrations := [
+        name for status, name in list_migrations() if not status
+    ]:
         stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
         stderr(f'    {output_dir}')
         stderr()
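Note: several hunks above (find_chrome_binary, check_migrations) and several below in index/__init__.py replace an assign-then-test pair with an assignment expression. The := form needs Python 3.8+ but is otherwise behavior-preserving: the name is still bound in the enclosing scope and the same truthiness test runs. Likewise, collapsing `except (FileNotFoundError, OSError)` to `except OSError` catches exactly the same errors, because FileNotFoundError is a subclass of OSError. A minimal demonstration of the walrus rewrite:

    import shutil

    # before: two statements
    chrome = shutil.which('google-chrome')
    if chrome:
        print(chrome)

    # after: one statement, same binding and same truthiness test
    if chrome := shutil.which('google-chrome'):
        print(chrome)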
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 38710182..f43263b4 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -105,7 +105,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         snapshot = write_link_to_sql_index(link)
 
     active_methods = get_archive_methods_for_link(link)
-    
+
     if methods:
         active_methods = [
             method for method in active_methods
@@ -168,13 +168,16 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     with open(ERROR_LOG, "a", encoding='utf-8') as f:
                         command = ' '.join(sys.argv)
                         ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-                        f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
-                            method_name,
-                            link.url,
-                            command,
-                            ts
-                        ) + "\n"))
-                        #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
+                        f.write(
+                            (
+                                (
+                                    "\n"
+                                    + f'Exception in archive_methods.save_{method_name}(Link(url={link.url})) command={command}; ts={ts}'
+                                )
+                                + "\n"
+                            )
+                        )
+                        #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
 
         # print('    ', stats)
@@ -197,7 +200,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             raise
 
         except Exception as err:
-            print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
+            print(f'    ! Failed to archive link: {err.__class__.__name__}: {err}')
             raise
 
     return link
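Note: the f.write() rewrite above stacks three layers of redundant grouping parentheses around what is now a single f-string concatenation; the behavior is unchanged. Under the same assumptions (local method_name, url, command, and ts values inside the except block), the whole call collapses to one expression — a sketch with placeholder values, not part of the patch:

    # placeholder values; in archive_link() these come from the except block
    method_name, url = 'wget', 'https://example.com'
    command, ts = 'archivebox add', '2023-08-02__14:57:45'

    with open('error.log', 'a', encoding='utf-8') as f:
        # equivalent to the nested-parentheses version emitted by Sourcery
        f.write(f'\nException in archive_methods.save_{method_name}'
                f'(Link(url={url})) command={command}; ts={ts}\n')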
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index b9d57aeb..a8390c0e 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -85,10 +85,9 @@ def merge_links(a: Link, b: Link) -> Link:
     )
 
     # all unique, truthy tags
-    tags_set = (
-        set(tag.strip() for tag in (a.tags or '').split(','))
-        | set(tag.strip() for tag in (b.tags or '').split(','))
-    )
+    tags_set = {tag.strip() for tag in (a.tags or '').split(',')} | {
+        tag.strip() for tag in (b.tags or '').split(',')
+    }
     tags = ','.join(tags_set) or None
 
     # all unique source entries
@@ -199,10 +198,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     if timestamp not in used_timestamps:
         return timestamp
 
-    new_timestamp = '{}.{}'.format(timestamp, nonce)
+    new_timestamp = f'{timestamp}.{nonce}'
     while new_timestamp in used_timestamps:
         nonce += 1
-        new_timestamp = '{}.{}'.format(timestamp, nonce)
+        new_timestamp = f'{timestamp}.{nonce}'
 
     return new_timestamp
@@ -292,8 +291,7 @@ def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) ->
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     for link in links:
-        index_link = snapshots.filter(url=link.url)
-        if index_link:
+        if index_link := snapshots.filter(url=link.url):
             link = merge_links(index_link[0].as_link(), link)
         unique_urls[link.url] = link
@@ -320,7 +318,7 @@ def dedupe_links(snapshots: QuerySet,
 
     # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in dedup_links_dict.keys():
+        if new_links[i].url in dedup_links_dict:
             new_links[i] = dedup_links_dict[new_links[i].url]
     log_deduping_finished(len(new_links))
@@ -345,8 +343,7 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
     """
 
     out_dir = out_dir or link.link_dir
-    existing_link = parse_json_link_details(out_dir)
-    if existing_link:
+    if existing_link := parse_json_link_details(out_dir):
         return merge_links(existing_link, link)
 
     return link
@@ -564,33 +561,23 @@ def is_valid(link: Link) -> bool:
     if not dir_exists:
         # unarchived links are not included in the valid list
         return False
-    if dir_exists and not index_exists:
+    if not index_exists:
         return False
-    if dir_exists and index_exists:
-        try:
-            parsed_link = parse_json_link_details(link.link_dir, guess=True)
-            return link.url == parsed_link.url
-        except Exception:
-            pass
+    try:
+        parsed_link = parse_json_link_details(link.link_dir, guess=True)
+        return link.url == parsed_link.url
+    except Exception:
+        pass
     return False
 
 def is_corrupt(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
-        # unarchived links are not considered corrupt
-        return False
-
-    if is_valid(link):
-        return False
-
-    return True
+    return False if not Path(link.link_dir).exists() else not is_valid(link)
 
 def is_archived(link: Link) -> bool:
     return is_valid(link) and link.is_archived
 
 def is_unarchived(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
-        return True
-    return not link.is_archived
+    return True if not Path(link.link_dir).exists() else not link.is_archived
 
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
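Note: the is_corrupt()/is_unarchived() rewrites above collapse early-return ifs into single conditional expressions; both spellings return the same value on every branch. A self-contained check with stand-in arguments (a plain path and a bool instead of the real Link model):

    from pathlib import Path

    def is_corrupt_before(link_dir, valid):
        if not Path(link_dir).exists():
            return False  # unarchived links are not considered corrupt
        if valid:
            return False
        return True

    def is_corrupt_after(link_dir, valid):
        return False if not Path(link_dir).exists() else not valid

    # identical on every branch: missing dir, valid dir, invalid dir
    for d, v in [('no-such-dir', True), ('.', True), ('.', False)]:
        assert is_corrupt_before(d, v) == is_corrupt_after(d, v)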