From 771d5ad676f98b75de3c05efb4860f64b96876dd Mon Sep 17 00:00:00 2001
From: Sourcery AI <>
Date: Wed, 2 Aug 2023 14:57:45 +0000
Subject: [PATCH] 'Refactored by Sourcery'

---
 archivebox/config.py              | 449 ++++++++++++++++++------------
 archivebox/extractors/__init__.py |  21 +-
 archivebox/index/__init__.py      |  45 ++-
 3 files changed, 293 insertions(+), 222 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 7334b169..87ae38c4 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -19,6 +19,7 @@ Documentation:
 """
 
+
 __package__ = 'archivebox'
 
 import os
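Note: the large CONFIG_SCHEMA hunk below is a pure re-layout — one-line entries are expanded to one key per line, and no defaults, types, or aliases change. Each entry's 'default' is either a literal or a callable that receives the config resolved so far, which is why USE_COLOR can read IS_TTY. A rough sketch of that resolution order, using a hypothetical resolve_defaults() helper (the real logic lives in load_config() in archivebox/config.py):

    import sys

    SCHEMA = {
        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
    }

    def resolve_defaults(schema):
        config = {}
        for key, entry in schema.items():
            default = entry['default']
            # callables see only the keys resolved before them, so order
            # matters: IS_TTY is computed first, then USE_COLOR reads it
            config[key] = default(config) if callable(default) else default
        return config

    print(resolve_defaults(SCHEMA))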
@@ -65,169 +66,262 @@ except ModuleNotFoundError:
 
 CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SHELL_CONFIG': {
-        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
-        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
-        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
-        'IN_DOCKER': {'type': bool, 'default': False},
-        'PUID': {'type': int, 'default': os.getuid()},
-        'PGID': {'type': int, 'default': os.getgid()},
+        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
+        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
+        'SHOW_PROGRESS': {
+            'type': bool,
+            'default': lambda c: (
+                c['IS_TTY'] and platform.system() != 'Darwin'
+            ),
+        },  # progress bars are buggy on mac, disable for now
+        'IN_DOCKER': {'type': bool, 'default': False},
+        'PUID': {'type': int, 'default': os.getuid()},
+        'PGID': {'type': int, 'default': os.getgid()},
         # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
     },
-
     'GENERAL_CONFIG': {
-        'OUTPUT_DIR': {'type': str, 'default': None},
-        'CONFIG_FILE': {'type': str, 'default': None},
-        'ONLY_NEW': {'type': bool, 'default': True},
-        'TIMEOUT': {'type': int, 'default': 60},
-        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
-        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
-        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
-        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
-        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
-        'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
-        'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
+        'OUTPUT_DIR': {'type': str, 'default': None},
+        'CONFIG_FILE': {'type': str, 'default': None},
+        'ONLY_NEW': {'type': bool, 'default': True},
+        'TIMEOUT': {'type': int, 'default': 60},
+        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
+        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
+        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
+        'URL_DENYLIST': {
+            'type': str,
+            'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$',
+            'aliases': ('URL_BLACKLIST',),
+        },  # to avoid downloading code assets as their own pages
+        'URL_ALLOWLIST': {
+            'type': str,
+            'default': None,
+            'aliases': ('URL_WHITELIST',),
+        },
+        'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
+        'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
     },
-
     'SERVER_CONFIG': {
-        'SECRET_KEY': {'type': str, 'default': None},
-        'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
-        'ALLOWED_HOSTS': {'type': str, 'default': '*'},
-        'DEBUG': {'type': bool, 'default': False},
-        'PUBLIC_INDEX': {'type': bool, 'default': True},
-        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
-        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
-        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
-        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
-        'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
-        'TIME_ZONE': {'type': str, 'default': 'UTC'},
-        'TIMEZONE': {'type': str, 'default': 'UTC'},
-        'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
-        'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
-        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
-        'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
-        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
+        'SECRET_KEY': {'type': str, 'default': None},
+        'BIND_ADDR': {
+            'type': str,
+            'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][
+                c['IN_DOCKER']
+            ],
+        },
+        'ALLOWED_HOSTS': {'type': str, 'default': '*'},
+        'DEBUG': {'type': bool, 'default': False},
+        'PUBLIC_INDEX': {'type': bool, 'default': True},
+        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
+        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
+        'FOOTER_INFO': {
+            'type': str,
+            'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',
+        },
+        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
+        'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
+        'TIME_ZONE': {'type': str, 'default': 'UTC'},
+        'TIMEZONE': {'type': str, 'default': 'UTC'},
+        'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
+        'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
+        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
+        'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
+        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
     },
-
     'ARCHIVE_METHOD_TOGGLES': {
-        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
-        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
-        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
-        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
-        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
-        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
-        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
-        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
-        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
-        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
-        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
-        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
-        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
-        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
-        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
-        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
-        'SAVE_DENYLIST': {'type': dict, 'default': {},},
+        'SAVE_TITLE': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_TITLE',),
+        },
+        'SAVE_FAVICON': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_FAVICON',),
+        },
+        'SAVE_WGET': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WGET',),
+        },
+        'SAVE_WGET_REQUISITES': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WGET_REQUISITES',),
+        },
+        'SAVE_SINGLEFILE': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_SINGLEFILE',),
+        },
+        'SAVE_READABILITY': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_READABILITY',),
+        },
+        'SAVE_MERCURY': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_MERCURY',),
+        },
+        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
+        'SAVE_SCREENSHOT': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_SCREENSHOT',),
+        },
+        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
+        'SAVE_HEADERS': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_HEADERS',),
+        },
+        'SAVE_WARC': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_WARC',),
+        },
+        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
+        'SAVE_MEDIA': {
+            'type': bool,
+            'default': True,
+            'aliases': ('FETCH_MEDIA',),
+        },
+        'SAVE_ARCHIVE_DOT_ORG': {
+            'type': bool,
+            'default': True,
+            'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',),
+        },
+        'SAVE_ALLOWLIST': {
+            'type': dict,
+            'default': {},
+        },
+        'SAVE_DENYLIST': {
+            'type': dict,
+            'default': {},
+        },
     },
-
     'ARCHIVE_METHOD_OPTIONS': {
-        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
-        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
-        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
-        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
-
-        'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-        'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
-
-        'COOKIES_FILE': {'type': str, 'default': None},
-        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
-
-        'CHROME_TIMEOUT': {'type': int, 'default': 0},
-        'CHROME_HEADLESS': {'type': bool, 'default': True},
-        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
-        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
-            '--write-description',
-            '--write-info-json',
-            '--write-annotations',
-            '--write-thumbnail',
-            '--no-call-home',
-            '--write-sub',
-            '--all-subs',
-            # There are too many of these and youtube
-            # throttles you with HTTP error 429
-            #'--write-auto-subs',
-            '--convert-subs=srt',
-            '--yes-playlist',
-            '--continue',
-            # This flag doesn't exist in youtube-dl
-            # only in yt-dlp
-            '--no-abort-on-error',
-            # --ignore-errors must come AFTER
-            # --no-abort-on-error
-            # https://github.com/yt-dlp/yt-dlp/issues/4914
-            '--ignore-errors',
-            '--geo-bypass',
-            '--add-metadata',
-            '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
-        ]},
-
-
-        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
-                      '--adjust-extension',
-                      '--convert-links',
-                      '--force-directories',
-                      '--backup-converted',
-                      '--span-hosts',
-                      '--no-parent',
-                      '-e', 'robots=off',
-                      ]},
-        'CURL_ARGS': {'type': list, 'default': ['--silent',
-                      '--location',
-                      '--compressed'
-                      ]},
-        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
-        'SINGLEFILE_ARGS': {'type': list, 'default' : None},
-        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
+        'RESOLUTION': {
+            'type': str,
+            'default': '1440,2000',
+            'aliases': ('SCREENSHOT_RESOLUTION',),
+        },
+        'GIT_DOMAINS': {
+            'type': str,
+            'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com',
+        },
+        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
+        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
+        'CURL_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}',
+        },
+        'WGET_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}',
+        },
+        'CHROME_USER_AGENT': {
+            'type': str,
+            'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)',
+        },
+        'COOKIES_FILE': {'type': str, 'default': None},
+        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
+        'CHROME_TIMEOUT': {'type': int, 'default': 0},
+        'CHROME_HEADLESS': {'type': bool, 'default': True},
+        'CHROME_SANDBOX': {
+            'type': bool,
+            'default': lambda c: not c['IN_DOCKER'],
+        },
+        'YOUTUBEDL_ARGS': {
+            'type': list,
+            'default': lambda c: [
+                '--write-description',
+                '--write-info-json',
+                '--write-annotations',
+                '--write-thumbnail',
+                '--no-call-home',
+                '--write-sub',
+                '--all-subs',
+                '--convert-subs=srt',
+                '--yes-playlist',
+                '--continue',
+                '--no-abort-on-error',
+                '--ignore-errors',
+                '--geo-bypass',
+                '--add-metadata',
+                f"--max-filesize={c['MEDIA_MAX_SIZE']}",
+            ],
+        },
+        'WGET_ARGS': {
+            'type': list,
+            'default': [
+                '--no-verbose',
+                '--adjust-extension',
+                '--convert-links',
+                '--force-directories',
+                '--backup-converted',
+                '--span-hosts',
+                '--no-parent',
+                '-e',
+                'robots=off',
+            ],
+        },
+        'CURL_ARGS': {
+            'type': list,
+            'default': ['--silent', '--location', '--compressed'],
+        },
+        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
+        'SINGLEFILE_ARGS': {'type': list, 'default': None},
+        'FAVICON_PROVIDER': {
+            'type': str,
+            'default': 'https://www.google.com/s2/favicons?domain={}',
+        },
     },
-
-    'SEARCH_BACKEND_CONFIG' : {
-        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
-        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
-        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
-        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
-        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
-        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
+    'SEARCH_BACKEND_CONFIG': {
+        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
+        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
+        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
+        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
+        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
+        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
         # SONIC
-        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
-        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
-        'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
+        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
+        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
+        'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
     },
-
     'DEPENDENCY_CONFIG': {
-        'USE_CURL': {'type': bool, 'default': True},
-        'USE_WGET': {'type': bool, 'default': True},
-        'USE_SINGLEFILE': {'type': bool, 'default': True},
-        'USE_READABILITY': {'type': bool, 'default': True},
-        'USE_MERCURY': {'type': bool, 'default': True},
-        'USE_GIT': {'type': bool, 'default': True},
-        'USE_CHROME': {'type': bool, 'default': True},
-        'USE_NODE': {'type': bool, 'default': True},
-        'USE_YOUTUBEDL': {'type': bool, 'default': True},
-        'USE_RIPGREP': {'type': bool, 'default': True},
-
-        'CURL_BINARY': {'type': str, 'default': 'curl'},
-        'GIT_BINARY': {'type': str, 'default': 'git'},
-        'WGET_BINARY': {'type': str, 'default': 'wget'},
-        'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
-        'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
-        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
+        'USE_CURL': {'type': bool, 'default': True},
+        'USE_WGET': {'type': bool, 'default': True},
+        'USE_SINGLEFILE': {'type': bool, 'default': True},
+        'USE_READABILITY': {'type': bool, 'default': True},
+        'USE_MERCURY': {'type': bool, 'default': True},
+        'USE_GIT': {'type': bool, 'default': True},
+        'USE_CHROME': {'type': bool, 'default': True},
+        'USE_NODE': {'type': bool, 'default': True},
+        'USE_YOUTUBEDL': {'type': bool, 'default': True},
+        'USE_RIPGREP': {'type': bool, 'default': True},
+        'CURL_BINARY': {'type': str, 'default': 'curl'},
+        'GIT_BINARY': {'type': str, 'default': 'git'},
+        'WGET_BINARY': {'type': str, 'default': 'wget'},
+        'SINGLEFILE_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('single-file'),
+        },
+        'READABILITY_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('readability-extractor'),
+        },
+        'MERCURY_BINARY': {
+            'type': str,
+            'default': lambda c: bin_path('mercury-parser'),
+        },
         #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
-        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
-        'NODE_BINARY': {'type': str, 'default': 'node'},
-        'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
-        'CHROME_BINARY': {'type': str, 'default': None},
-
-        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
-        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
+        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
+        'NODE_BINARY': {'type': str, 'default': 'node'},
+        'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
+        'CHROME_BINARY': {'type': str, 'default': None},
+        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
+        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
     },
 }
 
@@ -276,7 +370,7 @@ DEFAULT_CLI_COLORS = {
     'white': '\033[01;37m',
     'black': '\033[01;30m',
 }
-ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
+ANSI = {k: '' for k in DEFAULT_CLI_COLORS}
 
 COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
     '00': [(0, 0, 0), (0, 0, 0)],
@@ -519,15 +613,11 @@ def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
         config_file = ConfigParser()
         config_file.optionxform = str
         config_file.read(config_path)
-        # flatten into one namespace
-        config_file_vars = {
+        return {
             key.upper(): val
             for section, options in config_file.items()
-                for key, val in options.items()
+            for key, val in options.items()
         }
-        # print('[i] Loaded config file', os.path.abspath(config_path))
-        # print(config_file_vars)
-        return config_file_vars
 
     return None
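Note: the load_config_file() hunk above folds the intermediate config_file_vars dict into a direct return. The two for-clauses in the comprehension nest left-to-right — sections first, then each section's options — flattening the whole INI file into one uppercase-keyed namespace. A self-contained sketch with a throwaway INI string (hypothetical values, not a real ArchiveBox config):

    from configparser import ConfigParser

    config_file = ConfigParser()
    config_file.optionxform = str  # preserve key case, as config.py does
    config_file.read_string('[SERVER_CONFIG]\nbind_addr = 127.0.0.1:8000\n')

    # equivalent to two nested for-loops feeding one dict
    flat = {
        key.upper(): val
        for section, options in config_file.items()
        for key, val in options.items()
    }
    print(flat)  # {'BIND_ADDR': '127.0.0.1:8000'}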
@@ -536,8 +626,12 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
     from .system import atomic_write
 
-    CONFIG_HEADER = (
-    """# This is the config file for your ArchiveBox collection.
+    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
+    config_path = Path(out_dir) / CONFIG_FILENAME
+
+    if not config_path.exists():
+        CONFIG_HEADER = (
+            """# This is the config file for your ArchiveBox collection.
 #
 # You can add options here manually in INI format, or automatically by running:
 #    archivebox config --set KEY=VALUE
 #
 # If you modify this file manually, make sure to update your archive after by running:
 #    archivebox init
 #
 # A list of all possible config with documentation and examples can be found here:
 #    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
 
@@ -550,10 +644,6 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
 """)
 
-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    config_path = Path(out_dir) / CONFIG_FILENAME
-
-    if not config_path.exists():
         atomic_write(config_path, CONFIG_HEADER)
 
     config_file = ConfigParser()
@@ -568,10 +658,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     # Set up sections in empty config file
     for key, val in config.items():
         section = find_section(key)
-        if section in config_file:
-            existing_config = dict(config_file[section])
-        else:
-            existing_config = {}
+        existing_config = dict(config_file[section]) if section in config_file else {}
         config_file[section] = {**existing_config, key: val}
 
     # always make sure there's a SECRET_KEY defined for Django
@@ -604,10 +691,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     if Path(f'{config_path}.bak').exists():
         os.remove(f'{config_path}.bak')
 
-    return {
-        key.upper(): CONFIG.get(key.upper())
-        for key in config.keys()
-    }
+    return {key.upper(): CONFIG.get(key.upper()) for key in config}
 
 
 
@@ -637,7 +721,7 @@ def load_config(defaults: ConfigDefaultDict,
         except Exception as e:
             stderr()
             stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
-            stderr('    {}: {}'.format(e.__class__.__name__, e))
+            stderr(f'    {e.__class__.__name__}: {e}')
             stderr()
             stderr('    Check your config for mistakes and try again (your archive data is unaffected).')
             stderr()
@@ -683,7 +767,7 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Op
     else:
         stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
         for line in text[1:]:
-            stderr('{} {}'.format(prefix, line))
+            stderr(f'{prefix} {line}')
 
 
 # Dependency Metadata Helpers
@@ -754,8 +838,7 @@ def find_chrome_binary() -> Optional[str]:
         'google-chrome-dev',
     )
     for name in default_executable_paths:
-        full_path_exists = shutil.which(name)
-        if full_path_exists:
+        if full_path_exists := shutil.which(name):
            return name
 
     return None
@@ -793,7 +876,7 @@ def wget_supports_compression(config):
             "--help",
         ]
         return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
-    except (FileNotFoundError, OSError):
+    except OSError:
         return False
 
 def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
@@ -1104,7 +1187,9 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
         if '/Default' in str(config['CHROME_USER_DATA_DIR']):
             stderr()
             stderr('    Try removing /Default from the end e.g.:')
-            stderr('        CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
+            stderr(
+                f"""        CHROME_USER_DATA_DIR="{config['CHROME_USER_DATA_DIR'].split('/Default')[0]}\""""
+            )
         raise SystemExit(2)
@@ -1117,11 +1202,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
         for dependency, info in invalid_dependencies:
             stderr(
-                '    ! {}: {} ({})'.format(
-                    dependency,
-                    info['path'] or 'unable to find binary',
-                    info['version'] or 'unable to detect version',
-                )
+                f"    ! {dependency}: {info['path'] or 'unable to find binary'} ({info['version'] or 'unable to detect version'})"
             )
             if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
                 hint(('To install all packages automatically run: archivebox setup',
@@ -1178,9 +1259,9 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
     output_dir = out_dir or config['OUTPUT_DIR']
     from .index.sql import list_migrations
 
-    pending_migrations = [name for status, name in list_migrations() if not status]
-
-    if pending_migrations:
+    if pending_migrations := [
+        name for status, name in list_migrations() if not status
+    ]:
         stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
         stderr(f'    {output_dir}')
         stderr()
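Note: several hunks above (find_chrome_binary, check_migrations) and several below in index/__init__.py replace an assign-then-test pair with an assignment expression. The := form needs Python 3.8+ but is otherwise behavior-preserving: the name is still bound in the enclosing scope and the same truthiness test runs. Likewise, collapsing `except (FileNotFoundError, OSError)` to `except OSError` catches exactly the same errors, because FileNotFoundError is a subclass of OSError. A minimal demonstration of the walrus rewrite:

    import shutil

    # before: two statements
    chrome = shutil.which('google-chrome')
    if chrome:
        print(chrome)

    # after: one statement, same binding and same truthiness test
    if chrome := shutil.which('google-chrome'):
        print(chrome)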
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 38710182..f43263b4 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -105,7 +105,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         snapshot = write_link_to_sql_index(link)
 
     active_methods = get_archive_methods_for_link(link)
-    
+
     if methods:
         active_methods = [
             method for method in active_methods
@@ -168,13 +168,16 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     with open(ERROR_LOG, "a", encoding='utf-8') as f:
                         command = ' '.join(sys.argv)
                         ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-                        f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
-                            method_name,
-                            link.url,
-                            command,
-                            ts
-                        ) + "\n"))
-                        #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
+                        f.write(
+                            (
+                                (
+                                    "\n"
+                                    + f'Exception in archive_methods.save_{method_name}(Link(url={link.url})) command={command}; ts={ts}'
+                                )
+                                + "\n"
+                            )
+                        )
+                        #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
 
         # print('    ', stats)
@@ -197,7 +200,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             raise
 
         except Exception as err:
-            print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
+            print(f'    ! Failed to archive link: {err.__class__.__name__}: {err}')
             raise
 
     return link
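Note: the f.write() rewrite above stacks three layers of redundant grouping parentheses around what is now a single f-string concatenation; the behavior is unchanged. Under the same assumptions (local method_name, url, command, and ts values inside the except block), the whole call collapses to one expression — a sketch with placeholder values, not part of the patch:

    # placeholder values; in archive_link() these come from the except block
    method_name, url = 'wget', 'https://example.com'
    command, ts = 'archivebox add', '2023-08-02__14:57:45'

    with open('error.log', 'a', encoding='utf-8') as f:
        # equivalent to the nested-parentheses version emitted by Sourcery
        f.write(f'\nException in archive_methods.save_{method_name}'
                f'(Link(url={url})) command={command}; ts={ts}\n')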
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index b9d57aeb..a8390c0e 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -85,10 +85,9 @@ def merge_links(a: Link, b: Link) -> Link:
     )
 
     # all unique, truthy tags
-    tags_set = (
-        set(tag.strip() for tag in (a.tags or '').split(','))
-        | set(tag.strip() for tag in (b.tags or '').split(','))
-    )
+    tags_set = {tag.strip() for tag in (a.tags or '').split(',')} | {
+        tag.strip() for tag in (b.tags or '').split(',')
+    }
     tags = ','.join(tags_set) or None
 
     # all unique source entries
@@ -199,10 +198,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     if timestamp not in used_timestamps:
         return timestamp
 
-    new_timestamp = '{}.{}'.format(timestamp, nonce)
+    new_timestamp = f'{timestamp}.{nonce}'
     while new_timestamp in used_timestamps:
         nonce += 1
-        new_timestamp = '{}.{}'.format(timestamp, nonce)
+        new_timestamp = f'{timestamp}.{nonce}'
 
     return new_timestamp
@@ -292,8 +291,7 @@ def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) ->
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     for link in links:
-        index_link = snapshots.filter(url=link.url)
-        if index_link:
+        if index_link := snapshots.filter(url=link.url):
             link = merge_links(index_link[0].as_link(), link)
         unique_urls[link.url] = link
@@ -320,7 +318,7 @@ def dedupe_links(snapshots: QuerySet,
 
     # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in dedup_links_dict.keys():
+        if new_links[i].url in dedup_links_dict:
             new_links[i] = dedup_links_dict[new_links[i].url]
     log_deduping_finished(len(new_links))
@@ -345,8 +343,7 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
     """
 
     out_dir = out_dir or link.link_dir
-    existing_link = parse_json_link_details(out_dir)
-    if existing_link:
+    if existing_link := parse_json_link_details(out_dir):
         return merge_links(existing_link, link)
 
     return link
@@ -564,33 +561,23 @@ def is_valid(link: Link) -> bool:
     if not dir_exists:
         # unarchived links are not included in the valid list
         return False
-    if dir_exists and not index_exists:
+    if not index_exists:
         return False
-    if dir_exists and index_exists:
-        try:
-            parsed_link = parse_json_link_details(link.link_dir, guess=True)
-            return link.url == parsed_link.url
-        except Exception:
-            pass
+    try:
+        parsed_link = parse_json_link_details(link.link_dir, guess=True)
+        return link.url == parsed_link.url
+    except Exception:
+        pass
     return False
 
 def is_corrupt(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
-        # unarchived links are not considered corrupt
-        return False
-
-    if is_valid(link):
-        return False
-
-    return True
+    return False if not Path(link.link_dir).exists() else not is_valid(link)
 
 def is_archived(link: Link) -> bool:
     return is_valid(link) and link.is_archived
 
 def is_unarchived(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
-        return True
-    return not link.is_archived
+    return True if not Path(link.link_dir).exists() else not link.is_archived
 
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
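Note: the is_corrupt()/is_unarchived() rewrites above collapse early-return ifs into single conditional expressions; both spellings return the same value on every branch. A self-contained check with stand-in arguments (a plain path and a bool instead of the real Link model):

    from pathlib import Path

    def is_corrupt_before(link_dir, valid):
        if not Path(link_dir).exists():
            return False  # unarchived links are not considered corrupt
        if valid:
            return False
        return True

    def is_corrupt_after(link_dir, valid):
        return False if not Path(link_dir).exists() else not valid

    # identical on every branch: missing dir, valid dir, invalid dir
    for d, v in [('no-such-dir', True), ('.', True), ('.', False)]:
        assert is_corrupt_before(d, v) == is_corrupt_after(d, v)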