From 19682628f799f3c67013fc15f206f8d1b38b6ff7 Mon Sep 17 00:00:00 2001 From: Sourcery AI <> Date: Tue, 8 Aug 2023 20:41:11 +0000 Subject: [PATCH] 'Refactored by Sourcery' --- archivebox/config.py | 436 ++++++++++++++++++++++---------------- archivebox/core/models.py | 60 +++--- 2 files changed, 284 insertions(+), 212 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 36b383e1..b820da3b 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -19,6 +19,7 @@ Documentation: """ + __package__ = 'archivebox' import os @@ -65,168 +66,250 @@ except ModuleNotFoundError: CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SHELL_CONFIG': { - 'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()}, - 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']}, - 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now - 'IN_DOCKER': {'type': bool, 'default': False}, - 'PUID': {'type': int, 'default': os.getuid()}, - 'PGID': {'type': int, 'default': os.getgid()}, + 'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()}, + 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']}, + 'SHOW_PROGRESS': { + 'type': bool, + 'default': lambda c: ( + c['IS_TTY'] and platform.system() != 'Darwin' + ), + }, # progress bars are buggy on mac, disable for now + 'IN_DOCKER': {'type': bool, 'default': False}, + 'PUID': {'type': int, 'default': os.getuid()}, + 'PGID': {'type': int, 'default': os.getgid()}, # TODO: 'SHOW_HINTS': {'type: bool, 'default': True}, }, - 'GENERAL_CONFIG': { - 'OUTPUT_DIR': {'type': str, 'default': None}, - 'CONFIG_FILE': {'type': str, 'default': None}, - 'ONLY_NEW': {'type': bool, 'default': True}, - 'TIMEOUT': {'type': int, 'default': 60}, - 'MEDIA_TIMEOUT': {'type': int, 'default': 3600}, - 'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'}, - 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'}, - 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages - 'URL_WHITELIST': {'type': str, 'default': None}, - 'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True}, - 'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'}, - 'USE_TIMESTAMP_AS_ADDED': {'type': bool, 'default': False}, + 'OUTPUT_DIR': {'type': str, 'default': None}, + 'CONFIG_FILE': {'type': str, 'default': None}, + 'ONLY_NEW': {'type': bool, 'default': True}, + 'TIMEOUT': {'type': int, 'default': 60}, + 'MEDIA_TIMEOUT': {'type': int, 'default': 3600}, + 'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'}, + 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'}, + 'URL_BLACKLIST': { + 'type': str, + 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', + }, # to avoid downloading code assets as their own pages + 'URL_WHITELIST': {'type': str, 'default': None}, + 'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True}, + 'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'}, + 'USE_TIMESTAMP_AS_ADDED': {'type': bool, 'default': False}, }, - 'SERVER_CONFIG': { - 'SECRET_KEY': {'type': str, 'default': None}, - 'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]}, - 'ALLOWED_HOSTS': {'type': str, 'default': '*'}, - 'DEBUG': {'type': bool, 'default': False}, - 'PUBLIC_INDEX': {'type': bool, 'default': True}, - 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, - 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, - 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'}, - 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, - 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, - 'TIME_ZONE': {'type': str, 'default': 'UTC'}, - 'TIMEZONE': {'type': str, 'default': 'UTC'}, - 'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'}, - 'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''}, - 'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'}, - 'PREVIEW_ORIGINALS': {'type': bool, 'default': True}, - 'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'}, + 'SECRET_KEY': {'type': str, 'default': None}, + 'BIND_ADDR': { + 'type': str, + 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][ + c['IN_DOCKER'] + ], + }, + 'ALLOWED_HOSTS': {'type': str, 'default': '*'}, + 'DEBUG': {'type': bool, 'default': False}, + 'PUBLIC_INDEX': {'type': bool, 'default': True}, + 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, + 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, + 'FOOTER_INFO': { + 'type': str, + 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.', + }, + 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, + 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, + 'TIME_ZONE': {'type': str, 'default': 'UTC'}, + 'TIMEZONE': {'type': str, 'default': 'UTC'}, + 'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'}, + 'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''}, + 'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'}, + 'PREVIEW_ORIGINALS': {'type': bool, 'default': True}, + 'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'}, }, - 'ARCHIVE_METHOD_TOGGLES': { - 'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)}, - 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)}, - 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)}, - 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)}, - 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, - 'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)}, - 'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)}, - 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, - 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, - 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, - 'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)}, - 'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)}, - 'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)}, - 'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)}, - 'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)}, + 'SAVE_TITLE': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_TITLE',), + }, + 'SAVE_FAVICON': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_FAVICON',), + }, + 'SAVE_WGET': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_WGET',), + }, + 'SAVE_WGET_REQUISITES': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_WGET_REQUISITES',), + }, + 'SAVE_SINGLEFILE': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_SINGLEFILE',), + }, + 'SAVE_READABILITY': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_READABILITY',), + }, + 'SAVE_MERCURY': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_MERCURY',), + }, + 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, + 'SAVE_SCREENSHOT': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_SCREENSHOT',), + }, + 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, + 'SAVE_HEADERS': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_HEADERS',), + }, + 'SAVE_WARC': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_WARC',), + }, + 'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)}, + 'SAVE_MEDIA': { + 'type': bool, + 'default': True, + 'aliases': ('FETCH_MEDIA',), + }, + 'SAVE_ARCHIVE_DOT_ORG': { + 'type': bool, + 'default': True, + 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',), + }, }, - 'ARCHIVE_METHOD_OPTIONS': { - 'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)}, - 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'}, - 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, - 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, - - 'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, - 'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, - 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'}, - - 'COOKIES_FILE': {'type': str, 'default': None}, - 'CHROME_USER_DATA_DIR': {'type': str, 'default': None}, - - 'CHROME_TIMEOUT': {'type': int, 'default': 0}, - 'CHROME_HEADLESS': {'type': bool, 'default': True}, - 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']}, - 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [ - '--write-description', - '--write-info-json', - '--write-annotations', - '--write-thumbnail', - '--no-call-home', - '--write-sub', - '--all-subs', - # There are too many of these and youtube - # throttles you with HTTP error 429 - #'--write-auto-subs', - '--convert-subs=srt', - '--yes-playlist', - '--continue', - # This flag doesn't exist in youtube-dl - # only in yt-dlp - '--no-abort-on-error', - # --ignore-errors must come AFTER - # --no-abort-on-error - # https://github.com/yt-dlp/yt-dlp/issues/4914 - '--ignore-errors', - '--geo-bypass', - '--add-metadata', - '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']), - ]}, - - - 'WGET_ARGS': {'type': list, 'default': ['--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', - ]}, - 'CURL_ARGS': {'type': list, 'default': ['--silent', - '--location', - '--compressed' - ]}, - 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, - 'SINGLEFILE_ARGS': {'type': list, 'default' : None}, - 'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'}, + 'RESOLUTION': { + 'type': str, + 'default': '1440,2000', + 'aliases': ('SCREENSHOT_RESOLUTION',), + }, + 'GIT_DOMAINS': { + 'type': str, + 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com', + }, + 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, + 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, + 'CURL_USER_AGENT': { + 'type': str, + 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}', + }, + 'WGET_USER_AGENT': { + 'type': str, + 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}', + }, + 'CHROME_USER_AGENT': { + 'type': str, + 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)', + }, + 'COOKIES_FILE': {'type': str, 'default': None}, + 'CHROME_USER_DATA_DIR': {'type': str, 'default': None}, + 'CHROME_TIMEOUT': {'type': int, 'default': 0}, + 'CHROME_HEADLESS': {'type': bool, 'default': True}, + 'CHROME_SANDBOX': { + 'type': bool, + 'default': lambda c: not c['IN_DOCKER'], + }, + 'YOUTUBEDL_ARGS': { + 'type': list, + 'default': lambda c: [ + '--write-description', + '--write-info-json', + '--write-annotations', + '--write-thumbnail', + '--no-call-home', + '--write-sub', + '--all-subs', + '--convert-subs=srt', + '--yes-playlist', + '--continue', + '--no-abort-on-error', + '--ignore-errors', + '--geo-bypass', + '--add-metadata', + f"--max-filesize={c['MEDIA_MAX_SIZE']}", + ], + }, + 'WGET_ARGS': { + 'type': list, + 'default': [ + '--no-verbose', + '--adjust-extension', + '--convert-links', + '--force-directories', + '--backup-converted', + '--span-hosts', + '--no-parent', + '-e', + 'robots=off', + ], + }, + 'CURL_ARGS': { + 'type': list, + 'default': ['--silent', '--location', '--compressed'], + }, + 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, + 'SINGLEFILE_ARGS': {'type': list, 'default': None}, + 'FAVICON_PROVIDER': { + 'type': str, + 'default': 'https://www.google.com/s2/favicons?domain={}', + }, }, - - 'SEARCH_BACKEND_CONFIG' : { - 'USE_INDEXING_BACKEND': {'type': bool, 'default': True}, - 'USE_SEARCHING_BACKEND': {'type': bool, 'default': True}, - 'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'}, - 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'}, - 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491}, - 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'}, + 'SEARCH_BACKEND_CONFIG': { + 'USE_INDEXING_BACKEND': {'type': bool, 'default': True}, + 'USE_SEARCHING_BACKEND': {'type': bool, 'default': True}, + 'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'}, + 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'}, + 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491}, + 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'}, # SONIC - 'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'}, - 'SONIC_BUCKET': {'type': str, 'default': 'snapshots'}, - 'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90}, + 'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'}, + 'SONIC_BUCKET': {'type': str, 'default': 'snapshots'}, + 'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90}, }, - 'DEPENDENCY_CONFIG': { - 'USE_CURL': {'type': bool, 'default': True}, - 'USE_WGET': {'type': bool, 'default': True}, - 'USE_SINGLEFILE': {'type': bool, 'default': True}, - 'USE_READABILITY': {'type': bool, 'default': True}, - 'USE_MERCURY': {'type': bool, 'default': True}, - 'USE_GIT': {'type': bool, 'default': True}, - 'USE_CHROME': {'type': bool, 'default': True}, - 'USE_NODE': {'type': bool, 'default': True}, - 'USE_YOUTUBEDL': {'type': bool, 'default': True}, - 'USE_RIPGREP': {'type': bool, 'default': True}, - - 'CURL_BINARY': {'type': str, 'default': 'curl'}, - 'GIT_BINARY': {'type': str, 'default': 'git'}, - 'WGET_BINARY': {'type': str, 'default': 'wget'}, - 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')}, - 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')}, - 'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')}, + 'USE_CURL': {'type': bool, 'default': True}, + 'USE_WGET': {'type': bool, 'default': True}, + 'USE_SINGLEFILE': {'type': bool, 'default': True}, + 'USE_READABILITY': {'type': bool, 'default': True}, + 'USE_MERCURY': {'type': bool, 'default': True}, + 'USE_GIT': {'type': bool, 'default': True}, + 'USE_CHROME': {'type': bool, 'default': True}, + 'USE_NODE': {'type': bool, 'default': True}, + 'USE_YOUTUBEDL': {'type': bool, 'default': True}, + 'USE_RIPGREP': {'type': bool, 'default': True}, + 'CURL_BINARY': {'type': str, 'default': 'curl'}, + 'GIT_BINARY': {'type': str, 'default': 'git'}, + 'WGET_BINARY': {'type': str, 'default': 'wget'}, + 'SINGLEFILE_BINARY': { + 'type': str, + 'default': lambda c: bin_path('single-file'), + }, + 'READABILITY_BINARY': { + 'type': str, + 'default': lambda c: bin_path('readability-extractor'), + }, + 'MERCURY_BINARY': { + 'type': str, + 'default': lambda c: bin_path('mercury-parser'), + }, #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, - 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, - 'NODE_BINARY': {'type': str, 'default': 'node'}, - 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, - 'CHROME_BINARY': {'type': str, 'default': None}, - - 'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, - 'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}}, + 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, + 'NODE_BINARY': {'type': str, 'default': 'node'}, + 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, + 'CHROME_BINARY': {'type': str, 'default': None}, + 'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, + 'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}}, }, } @@ -275,7 +358,7 @@ DEFAULT_CLI_COLORS = { 'white': '\033[01;37m', 'black': '\033[01;30m', } -ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()} +ANSI = {k: '' for k in DEFAULT_CLI_COLORS} COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], { '00': [(0, 0, 0), (0, 0, 0)], @@ -515,15 +598,11 @@ def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]: config_file = ConfigParser() config_file.optionxform = str config_file.read(config_path) - # flatten into one namespace - config_file_vars = { + return { key.upper(): val for section, options in config_file.items() - for key, val in options.items() + for key, val in options.items() } - # print('[i] Loaded config file', os.path.abspath(config_path)) - # print(config_file_vars) - return config_file_vars return None @@ -532,8 +611,12 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: from .system import atomic_write - CONFIG_HEADER = ( - """# This is the config file for your ArchiveBox collection. + out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() + config_path = Path(out_dir) / CONFIG_FILENAME + + if not config_path.exists(): + CONFIG_HEADER = ( + """# This is the config file for your ArchiveBox collection. # # You can add options here manually in INI format, or automatically by running: # archivebox config --set KEY=VALUE @@ -546,10 +629,6 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: """) - out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() - config_path = Path(out_dir) / CONFIG_FILENAME - - if not config_path.exists(): atomic_write(config_path, CONFIG_HEADER) config_file = ConfigParser() @@ -564,10 +643,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: # Set up sections in empty config file for key, val in config.items(): section = find_section(key) - if section in config_file: - existing_config = dict(config_file[section]) - else: - existing_config = {} + existing_config = dict(config_file[section]) if section in config_file else {} config_file[section] = {**existing_config, key: val} # always make sure there's a SECRET_KEY defined for Django @@ -600,10 +676,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: if Path(f'{config_path}.bak').exists(): os.remove(f'{config_path}.bak') - return { - key.upper(): CONFIG.get(key.upper()) - for key in config.keys() - } + return {key.upper(): CONFIG.get(key.upper()) for key in config} @@ -633,7 +706,7 @@ def load_config(defaults: ConfigDefaultDict, except Exception as e: stderr() stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config) - stderr(' {}: {}'.format(e.__class__.__name__, e)) + stderr(f' {e.__class__.__name__}: {e}') stderr() stderr(' Check your config for mistakes and try again (your archive data is unaffected).') stderr() @@ -679,7 +752,7 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Op else: stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi)) for line in text[1:]: - stderr('{} {}'.format(prefix, line)) + stderr(f'{prefix} {line}') # Dependency Metadata Helpers @@ -750,8 +823,7 @@ def find_chrome_binary() -> Optional[str]: 'google-chrome-dev', ) for name in default_executable_paths: - full_path_exists = shutil.which(name) - if full_path_exists: + if full_path_exists := shutil.which(name): return name return None @@ -789,7 +861,7 @@ def wget_supports_compression(config): "--help", ] return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode - except (FileNotFoundError, OSError): + except OSError: return False def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: @@ -1100,7 +1172,9 @@ def check_system_config(config: ConfigDict=CONFIG) -> None: if '/Default' in str(config['CHROME_USER_DATA_DIR']): stderr() stderr(' Try removing /Default from the end e.g.:') - stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0])) + stderr( + f""" CHROME_USER_DATA_DIR="{config['CHROME_USER_DATA_DIR'].split('/Default')[0]}\"""" + ) raise SystemExit(2) @@ -1113,11 +1187,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow') for dependency, info in invalid_dependencies: stderr( - ' ! {}: {} ({})'.format( - dependency, - info['path'] or 'unable to find binary', - info['version'] or 'unable to detect version', - ) + f" ! {dependency}: {info['path'] or 'unable to find binary'} ({info['version'] or 'unable to detect version'})" ) if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'): hint(('To install all packages automatically run: archivebox setup', @@ -1174,9 +1244,9 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO output_dir = out_dir or config['OUTPUT_DIR'] from .index.sql import list_migrations - pending_migrations = [name for status, name in list_migrations() if not status] - - if pending_migrations: + if pending_migrations := [ + name for status, name in list_migrations() if not status + ]: stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow') stderr(f' {output_dir}') stderr() diff --git a/archivebox/core/models.py b/archivebox/core/models.py index faa48e75..7aeae252 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -63,26 +63,25 @@ class Tag(models.Model): return slug def save(self, *args, **kwargs): - if self._state.adding and not self.slug: - self.slug = self.slugify(self.name) - - # if name is different but slug conficts with another tags slug, append a counter - # with transaction.atomic(): - slugs = set( - type(self) - ._default_manager.filter(slug__startswith=self.slug) - .values_list("slug", flat=True) - ) - - i = None - while True: - slug = self.slugify(self.name, i) - if slug not in slugs: - self.slug = slug - return super().save(*args, **kwargs) - i = 1 if i is None else i+1 - else: + if not self._state.adding or self.slug: return super().save(*args, **kwargs) + self.slug = self.slugify(self.name) + + # if name is different but slug conficts with another tags slug, append a counter + # with transaction.atomic(): + slugs = set( + type(self) + ._default_manager.filter(slug__startswith=self.slug) + .values_list("slug", flat=True) + ) + + i = None + while True: + slug = self.slugify(self.name, i) + if slug not in slugs: + self.slug = slug + return super().save(*args, **kwargs) + i = 1 if i is None else i+1 class Snapshot(models.Model): @@ -175,7 +174,7 @@ class Snapshot(models.Model): @cached_property def archive_path(self): - return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) + return f'{ARCHIVE_DIR_NAME}/{self.timestamp}' @cached_property def archive_size(self): @@ -191,11 +190,13 @@ class Snapshot(models.Model): @cached_property def thumbnail_url(self) -> Optional[str]: - result = self.archiveresult_set.filter( - extractor='screenshot', - status='succeeded' - ).only('output').last() - if result: + if ( + result := self.archiveresult_set.filter( + extractor='screenshot', status='succeeded' + ) + .only('output') + .last() + ): return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}']) return None @@ -248,10 +249,11 @@ class Snapshot(models.Model): return None def save_tags(self, tags: List[str]=()) -> None: - tags_id = [] - for tag in tags: - if tag.strip(): - tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) + tags_id = [ + Tag.objects.get_or_create(name=tag)[0].id + for tag in tags + if tag.strip() + ] self.tags.clear() self.tags.add(*tags_id)