Skip to content

Commit

Permalink
Add start of cdxj validation for issue #86
Browse files Browse the repository at this point in the history
  • Loading branch information
machawk1 committed Jan 22, 2017
1 parent 16847f4 commit 8625796
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 1 deletion.
1 change: 1 addition & 0 deletions invalid.cdxj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
edu,odu,cs)/~salam 20160305192247 {"locator": "urn:ipfs/QmeVWGtnfuJ1QnpmtKKnyArVgEpq7v31kktEfh6c8mDiXE/QmZWKQRBNXNrVZ69LoGpMNJi5NU66gDhnGtQukWJepv7Kr", "encryption_method": "xor", "encryption_key": "radon", "mime_type": "text/html", "status_code": "200"}
6 changes: 6 additions & 0 deletions ipwb/replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ def getIndexFileFullPath(cdxjFilePath=INDEX_FILE):

def getURIsInCDXJ(cdxjFilePath=INDEX_FILE):
indexFileContents = getIndexFileContents(cdxjFilePath)
if not ipwbConfig.isValidCDXJ(indexFileContents):
print "Invalid CDXJ line!"
return 0

if not indexFileContents:
return 0
Expand All @@ -238,6 +241,9 @@ def getURIsInCDXJ(cdxjFilePath=INDEX_FILE):
def retrieveMemCount(cdxjFilePath=INDEX_FILE):
print "Retrieving URI-Ms from {0}".format(cdxjFilePath)
indexFileContents = getIndexFileContents(cdxjFilePath)
if not ipwbConfig.isValidCDXJ(indexFileContents):
print "Invalid CDXJ line, cannot get Memento count"
return 0

if not indexFileContents:
return 0
Expand Down
26 changes: 25 additions & 1 deletion ipwb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import subprocess
import site
# from requests.exceptions import ConnectionError
from pywb.utils.canonicalize import unsurt
from ipfsapi.exceptions import ConnectionError


Expand Down Expand Up @@ -43,9 +44,31 @@ def isDaemonAlive(hostAndPort="{0}:{1}".format(IPFSAPI_IP, IPFSAPI_PORT)):
print sys.exc_info()[0]


class InvalidCDXJException(Exception):
    """Signals a CDXJ line that breaks the rules checked by isValidCDXJ."""

def isValidCDXJ(stringIn):
    """Report whether every line of a CDXJ string is well-formed.

    Metadata lines (those starting with '@' or '!') are accepted only
    before the first record line. Blank lines are ignored. Every other
    line must hold three space-separated fields: a SURT URI that pywb's
    ``unsurt`` accepts, a 14-digit datetime, and a JSON value.

    TODO: Check specific strict syntax of the CDXJ format.

    Args:
        stringIn: Contents of a CDXJ index as a single string.

    Returns:
        True when all lines pass the checks above, False otherwise. On
        failure the type of the first exception hit is printed; malformed
        input (including a non-string argument) never raises to the caller.
    """
    try:
        metadataAllowed = True
        for line in stringIn.split('\n'):
            if not line.strip():
                # A trailing newline yields an empty final element; it is
                # not a record and must not invalidate the whole index.
                continue
            # NOTE(review): the previous check tested line[1] for '!',
            # which looks like a typo for the first character -- confirm
            # against the CDXJ metadata-header convention.
            if line.startswith(('@', '!')):
                if not metadataAllowed:
                    raise InvalidCDXJException()
                continue
            metadataAllowed = False
            # Unpacking raises ValueError when a field is missing.
            uri, timestamp, cdxjJSON = line.split(' ', 2)
            unsurt(uri)  # Will throw an exception if invalid
            if not timestamp.isdigit() or len(timestamp) != 14:
                raise InvalidCDXJException()
            json.loads(cdxjJSON)
        return True
    except Exception:
        # Deliberately broad: unsurt's failure modes are not visible here.
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit pass.
        print(sys.exc_info()[0])
        return False


def getURIsInCDXJ(cdxjFile=INDEX_FILE):
Expand Down Expand Up @@ -158,6 +181,7 @@ def getIPWBReplayIndexPath():

def firstRun():
import indexer

# Ensure the sample WARC is in IPFS
print 'Executing first-run procedure on provided sample data.'
indexer.indexFileAt(site.getsitepackages()[0] + '/ipwb/' + SAMPLE_WARC,
Expand Down
19 changes: 19 additions & 0 deletions tests/test_replay.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import pytest
from ipwb import util

import testUtil as ipwbTest

# Successful retrieval
# Accurate retrieval
Expand Down Expand Up @@ -70,4 +73,20 @@ def test_fileImport_nonCDXJ(): # Fail w/ friendly message when non-cdxj
pass


@pytest.mark.cdxjValidation
def test_cdxj_valid():
    """Run util.isValidCDXJ over sample lines and check each verdict."""
    fullRecord = (
        'edu,odu,cs)/~salam 20160305192247 '
        '{"locator": "urn:ipfs/'
        'QmeVWGtnfuJ1QnpmtKKnyArVgEpq7v31kktEfh6c8mDiXE/'
        'QmZWKQRBNXNrVZ69LoGpMNJi5NU66gDhnGtQukWJepv7Kr", '
        '"encryption_method": "xor", "encryption_key": "radon", '
        '"mime_type": "text/html", "status_code": "200"}'
    )
    # (input, expected verdict), kept in the order the checks were written.
    cases = [
        ('test', False),  # Missing fields
        (fullRecord, True),  # Valid SURT with a full JSON payload
        ('edu,odu,cs)/ 20160305192247 radon', False),  # Bad JSON in field 3
        ('edu,odu,cs)/ 20160305192247 {}', True),  # Valid SURT, empty JSON
        ('edu,odu,cs)/ 2016030519224 {}', False),  # Datetime one digit short
    ]
    for cdxjLine, shouldBeValid in cases:
        assert bool(util.isValidCDXJ(cdxjLine)) == shouldBeValid
    # Not asserted yet: an invalid SURT URI such as
    # 'foo.bar 20160305192247 {}' -- pywb catches its own ValueError.


# TODO: Have unit tests for each function in replay.py

0 comments on commit 8625796

Please sign in to comment.