From 36aa67d89c238b0f33951c7766356005968c5e49 Mon Sep 17 00:00:00 2001 From: Mashiat Sarker Shakkhar Date: Fri, 22 May 2020 00:55:53 -0400 Subject: [PATCH 1/2] logging.py: Import stderr from config. The function `reject_stdin()` uses the function `stderr` defined in the config module but it was never imported. --- archivebox/cli/logging.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/cli/logging.py b/archivebox/cli/logging.py index 88c472e7..5e499800 100644 --- a/archivebox/cli/logging.py +++ b/archivebox/cli/logging.py @@ -22,6 +22,7 @@ from ..config import ( IS_TTY, SHOW_PROGRESS, TERM_WIDTH, + stderr, ) From 3fa593a5f6b2ce580ddaba7d5949e4357d1c6612 Mon Sep 17 00:00:00 2001 From: Mashiat Sarker Shakkhar Date: Fri, 22 May 2020 15:41:42 -0400 Subject: [PATCH 2/2] util: Don't choke on unix timestamp string while parsing date. Some older versions of ArchiveBox stored the unix timestamp as a string in the index instead of a float. This created a problem for me during migration while running `archivebox init` on an archive created with a previous revision of ArchiveBox. The archive that I have is otherwise in good condition (not malformed or garbled). While we don't retain the old behavior, I think it is sane to at least retain backward compatibility here while migrating. Unfortunately, I don't have the exact revision that I used to create the old archive - so I cannot be sure how I ended up with a unix timestamp as a string in my index. Additionally, convert unix timestamps into datetime. I think it's a bug that we coerce float / int (presumably a unix timestamp) into a string. These need to be converted to datetime. 
--- archivebox/util.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/archivebox/util.py b/archivebox/util.py index 5a4ec88c..da1e9b7b 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -136,12 +136,18 @@ def parse_date(date: Any) -> Optional[datetime]: if isinstance(date, datetime): return date - - if isinstance(date, (float, int)): - date = str(date) if isinstance(date, str): - return dateparser.parse(date) + try: + return dateparser.parse(date) + except dateparser._parser.ParserError: + # I assume that if the string is not parsable as date / time, it + # is a unix timestamp. While it's possible that the string is + # garbage, there's no way we can automatically fix that. + date = float(date) + + if isinstance(date, (float, int)): + return datetime.utcfromtimestamp(date) raise ValueError('Tried to parse invalid date! {}'.format(date))