From 0f2bd18b2aa84f9d241d1372230f19ad462c932b Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Tue, 3 Jul 2018 17:19:53 -0400 Subject: [PATCH 1/2] Show better breakdown of resources being replayed. Closes #405 --- ipwb/replay.py | 19 +- ipwb/samples/warcs/5mementosAndFroggie.warc | 183 ++++++++++++++++++++ ipwb/webui/index.html | 2 +- 3 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 ipwb/samples/warcs/5mementosAndFroggie.warc diff --git a/ipwb/replay.py b/ipwb/replay.py index 0b617c89..633b433b 100755 --- a/ipwb/replay.py +++ b/ipwb/replay.py @@ -80,8 +80,11 @@ def showWebUI(path): if os.path.exists(iFileAbs): iFile = iFileAbs # Local file + (mCount, uniqueURIRs) = retrieveMemCount(iFile) content = content.replace( - 'MEMCOUNT', str(retrieveMemCount(iFile))) + 'MEMCOUNT', str(mCount)) + content = content.replace( + 'UNIQUE', str(uniqueURIRs)) content = content.replace( 'let uris = []', @@ -836,21 +839,29 @@ def retrieveMemCount(cdxjFilePath=INDEX_FILE): print("Retrieving URI-Ms from {0}".format(cdxjFilePath)) indexFileContents = getIndexFileContents(cdxjFilePath) + errReturn = (0, 0) + if not indexFileContents: - return 0 + return errReturn lines = indexFileContents.strip().split('\n') if not lines: - return 0 + return errReturn mementoCount = 0 + bucket = {} for i, l in enumerate(lines): validCDXJLine = ipwbConfig.isValidCDXJLine(l) metadataRecord = ipwbConfig.isCDXJMetadataRecord(l) if validCDXJLine and not metadataRecord: mementoCount += 1 + surtURI = l.split()[0] + if surtURI not in bucket: + bucket[surtURI] = 1 + else: # Unnecessary to keep count now, maybe useful later + bucket[surtURI] += 1 - return mementoCount + return mementoCount, len(bucket.keys()) def objectifyCDXJData(lines, onlyURI): diff --git a/ipwb/samples/warcs/5mementosAndFroggie.warc b/ipwb/samples/warcs/5mementosAndFroggie.warc new file mode 100644 index 00000000..45aa85dc --- /dev/null +++ b/ipwb/samples/warcs/5mementosAndFroggie.warc @@ -0,0 +1,183 @@ +WARC/1.0 +WARC-Type: warcinfo +WARC-Date: 2017-02-18T10:00:00Z +WARC-Filename: ipwb-memento.warc +WARC-Record-ID: +Content-Type: application/warc-fields +Content-Length: 238 + +software: Fabricated +ip: 127.0.0.1 +hostname: localhost +format: WARC File Format 1.0 +conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf +description: SampleCrawl +robots: ignore +http-header-user-agent: WARCFab/1.0 + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://memento.us/ +WARC-Date: 2014-01-14T10:00:00Z +WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 +WARC-Record-ID: +Content-Type: application/http; msgtype=response +Content-Length: 186 + +HTTP/1.1 200 OK +Server: nginx +Date: Mon, 30 Jan 2017 18:39:49 GMT +Content-Type: text/html +Connection: close +Vary: Accept-Encoding + +Memento for 1/14/2014 10:00am + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://memento.us/ +WARC-Date: 2014-01-15T10:15:00Z +WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 +WARC-Record-ID: +Content-Type: application/http; msgtype=response +Content-Length: 186 + +HTTP/1.1 200 OK +Server: nginx +Date: Mon, 30 Jan 2017 18:39:49 GMT +Content-Type: text/html +Connection: close +Vary: Accept-Encoding + +Memento for 1/15/2014 10:15am + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://memento.us/ +WARC-Date: 2013-02-02T10:00:00Z +WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 +WARC-Record-ID: +Content-Type: application/http; msgtype=response +Content-Length: 186 + +HTTP/1.1 200 OK +Server: nginx +Date: Mon, 30 Jan 2017 18:39:49 GMT +Content-Type: text/html +Connection: close +Vary: Accept-Encoding + +Memento for 2/2/2013 10:00am + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://memento.us/ +WARC-Date: 2016-12-31T11:00:00Z +WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 +WARC-Record-ID: +Content-Type: application/http; msgtype=response +Content-Length: 187 + +HTTP/1.1 200 OK +Server: nginx +Date: Mon, 30 Jan 2017 18:39:49 GMT +Content-Type: text/html +Connection: close +Vary: Accept-Encoding + +Memento for 12/31/2016 11:00am + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://memento.us/ +WARC-Date: 2016-12-31T11:00:01Z +WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 +WARC-Record-ID: +Content-Type: application/http; msgtype=response +Content-Length: 187 + +HTTP/1.1 200 OK +Server: nginx +Date: Mon, 30 Jan 2017 18:39:49 GMT +Content-Type: text/html +Connection: close +Vary: Accept-Encoding + +Memento for 12/31/2016 11:01am + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://someotherURI.us/ +WARC-Date: 2016-12-31T11:00:00Z +WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 +WARC-Record-ID: +Content-Type: application/http; msgtype=response +Content-Length: 170 + +HTTP/1.1 200 OK +Server: nginx +Date: Mon, 30 Jan 2017 18:39:49 GMT +Content-Type: text/html +Connection: close +Vary: Accept-Encoding + +SomeotherURI + + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://anothersite.us/ +WARC-Date: 2016-12-31T11:00:00Z +WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 +WARC-Record-ID: +Content-Type: application/http; msgtype=response +Content-Length: 170 + +HTTP/1.1 200 OK +Server: nginx +Date: Mon, 30 Jan 2017 18:39:49 GMT +Content-Type: text/html +Connection: close +Vary: Accept-Encoding + +Another site + + + + + +WARC/1.0 +WARC-Type: response +WARC-Record-ID: +WARC-Warcinfo-ID: +WARC-Target-URI: http://whensAPNGNotAPing.net +WARC-Date: 2017-03-01T19:26:39Z +WARC-Block-Digest: sha1:SZPTTOGV3LYYR6H7OMA7QC6YKZACNQSY +Content-Type: image/png +Content-Length: 154 + +HTTP/1.1 200 OK +Server: nginx +Date: Mon, 30 Jan 2017 18:39:49 GMT +Content-Type: image/png +Connection: close +Vary: Accept-Encoding + +Ceci n'est pas une PNG. diff --git a/ipwb/webui/index.html b/ipwb/webui/index.html index 5084fa20..dd13e9a3 100644 --- a/ipwb/webui/index.html +++ b/ipwb/webui/index.html @@ -38,7 +38,7 @@

ipwb

From dae645833a2699ed054dedc7a0be4c39a5a1ad8a Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Tue, 3 Jul 2018 17:28:08 -0400 Subject: [PATCH 2/2] Add space for pycodestyle compliance --- ipwb/replay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipwb/replay.py b/ipwb/replay.py index 633b433b..e585d2dd 100755 --- a/ipwb/replay.py +++ b/ipwb/replay.py @@ -858,7 +858,7 @@ def retrieveMemCount(cdxjFilePath=INDEX_FILE): surtURI = l.split()[0] if surtURI not in bucket: bucket[surtURI] = 1 - else: # Unnecessary to keep count now, maybe useful later + else: # Unnecessary to keep count now, maybe useful later bucket[surtURI] += 1 return mementoCount, len(bucket.keys())