Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JSON schema descriptions for BQ and ES #18

Merged
merged 17 commits into from
Apr 10, 2018
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 7 additions & 17 deletions zschema/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@

def usage():
sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1])
sys.stderr.write("Valid commands: bigquery, elasticsearch, json, text, html, censys-html, flat, validate.\n")
sys.stderr.write("schema should be defined as file.py:record\n")
sys.stderr.write("Valid commands: bigquery, elasticsearch, docs-es, docs-bq, json, flat, validate.\n")
sys.stderr.write("Schema should be passed as file.py:record\n")
sys.stderr.write("The optional 'file' argument is used only as the test file for the 'validate' command.\n")
sys.stderr.write("VERSION: %s\n" % zschema.__version__)
sys.exit(1)

Expand All @@ -27,26 +28,15 @@ def main():
print json.dumps(record.to_bigquery())
elif command == "elasticsearch":
print json.dumps(record.to_es(recname))
elif command == "docs-es":
print json.dumps(record.docs_es(recname))
elif command == "docs-bq":
print json.dumps(record.docs_bq(recname))
elif command == "json":
print record.to_json()
elif command == "html":
for r in record.to_flat():
type_ = r.get("es_type", "")
print "<tr><td>%s</td><td>%s</td></tr>" % (r["name"], type_)
elif command == "text":
print record.to_text()
elif command == "flat":
for r in record.to_flat():
print json.dumps(r)
elif command == "censys-html":
for r in record.to_flat():
type_ = r.get("es_type", None)
len_ = r["name"].count(".")
style = 'style="padding-left: %ipx"' % (15 * len_ + 5)
if not type_:
print '<tr class="record"><td %s>%s</td><td>%s</td></tr>' % (style, r["name"], "")
else:
print "<tr><td %s>%s</td><td>%s</td></tr>" % (style, r["name"], type_)
elif command == "validate":
if not os.path.exists(sys.argv[3]):
sys.stderr.write("Invalid test file. %s does not exist.\n" % sys.argv[3])
Expand Down
103 changes: 83 additions & 20 deletions zschema/compounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ def _is_valid_object(name, object_):

class ListOf(Keyable):

def __init__(self, object_, max_items=10):
def __init__(self, object_, max_items=10, doc=None, category=None):
self.object_ = object_
self.max_items = max_items
self.category = category
self.doc = doc
_is_valid_object("Anonymous ListOf", object_)

@property
Expand All @@ -33,9 +35,27 @@ def to_bigquery(self, name):
retv["mode"] = "REPEATED"
return retv

def docs_bq(self, parent_category=None):
    """Describe this repeated field for the BigQuery documentation output.

    The description starts as whatever the wrapped element's own
    ``docs_bq()`` returns. On top of that, ``category`` is overwritten with
    this list's category (falling back to ``parent_category``), ``repeated``
    is set to True, and ``doc`` is replaced when this list has a truthy doc.
    """
    # NOTE(review): the element is described without a parent_category, so
    # only the top-level "category" key reflects the inherited value --
    # confirm that nested fields are not expected to inherit it.
    description = self.object_.docs_bq()
    description.update({
        "category": self.category or parent_category,
        "repeated": True,
    })
    if self.doc:
        description["doc"] = self.doc
    return description

def to_es(self):
return self.object_.to_es()

def docs_es(self, parent_category=None):
    """Describe this repeated field for the Elasticsearch documentation output.

    The description starts as whatever the wrapped element's own
    ``docs_es()`` returns. On top of that, ``category`` is overwritten with
    this list's category (falling back to ``parent_category``), ``repeated``
    is set to True, and ``doc`` is replaced when this list has a truthy doc.
    """
    # NOTE(review): the element is described without a parent_category, so
    # only the top-level "category" key reflects the inherited value --
    # confirm that nested fields are not expected to inherit it.
    description = self.object_.docs_es()
    description.update({
        "category": self.category or parent_category,
        "repeated": True,
    })
    if self.doc:
        description["doc"] = self.doc
    return description

def validate(self, name, value):
if type(value) != list:
raise DataValidationException("%s: %s is not a list",
Expand All @@ -59,11 +79,13 @@ def __init__(self,
doc=None,
extends=None,
allow_unknown=False,
exclude=None):
exclude=None,
category=None):
self.definition = definition
self.required = required
self.allow_unknown = allow_unknown
self.doc = doc
self.category = category
self._exclude = set(exclude) if exclude else set([])
# merge
if extends:
Expand Down Expand Up @@ -113,14 +135,25 @@ def merge(self, other):
return self

def to_bigquery(self, name):
fields = [v.to_bigquery(k) for (k,v) in sorted(self.definition.iteritems()) if \
not v.exclude_bigquery]
return {
fields = [v.to_bigquery(k) \
for (k,v) in sorted(self.definition.iteritems()) \
if not v.exclude_bigquery
]
retv = {
"name":self.key_to_bq(name),
"type":"RECORD",
"fields":fields,
"mode":"REQUIRED" if self.required else "NULLABLE"
}
return retv

def docs_bq(self, parent_category=None):
    """Describe this sub-record and its fields for the BigQuery docs output.

    Returns the dict built by ``_docs_common`` with an added ``fields``
    mapping: one entry per definition item (visited in sorted key order)
    whose ``exclude_bigquery`` flag is falsy, keyed by ``key_to_bq(name)``
    and holding that field's own ``docs_bq()`` description.
    """
    doc_tree = self._docs_common(parent_category=parent_category)
    children = {}
    for key, field in sorted(self.definition.iteritems()):
        if field.exclude_bigquery:
            continue
        # NOTE(review): children are described without a parent_category,
        # so this record's category is not handed down -- confirm intended.
        children[self.key_to_bq(key)] = field.docs_bq()
    doc_tree["fields"] = children
    return doc_tree

def print_indent_string(self, name, indent):
tabs = "\t" * indent if indent else ""
Expand All @@ -129,10 +162,28 @@ def print_indent_string(self, name, indent):
value.print_indent_string(name, indent+1)

def to_es(self):
p = {self.key_to_es(k): v.to_es() for k, v in sorted(self.definition.iteritems()) \
p = {self.key_to_es(k): v.to_es() \
for k, v in sorted(self.definition.iteritems()) \
if not v.exclude_elasticsearch}
return {"properties": p}

def _docs_common(self, parent_category):
category = self.category or parent_category
retv = {
"category": category,
"doc": self.doc,
"type": self.__class__.__name__,
Copy link
Contributor

@justinbastress justinbastress Apr 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems it would be nice if this could be overridden...

  • __class__.__name__ doesn't keep package information (?)
  • Constrains our type names to match python class name rules (and constrains our class names to match our type names rules?)

Beyond the scope of this PR, but I just noticed it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems it would be nice if this could be overridden

Beyond the scope of this PR, but I just noticed it.

Good point, @justinbastress. It looks like coupling to __class__.__name__ has been the idiom for a while (e.g., an initial commit for the CLI), but it is an area for future improvement.

"required": self.required,
}
return retv

def docs_es(self, parent_category=None):
    """Describe this sub-record and its fields for the Elasticsearch docs output.

    Returns the dict built by ``_docs_common`` with an added ``fields``
    mapping: one entry per definition item (visited in sorted key order)
    whose ``exclude_elasticsearch`` flag is falsy, keyed by
    ``key_to_es(name)`` and holding that field's own ``docs_es()``
    description.
    """
    doc_tree = self._docs_common(parent_category=parent_category)
    children = {}
    for key, field in sorted(self.definition.iteritems()):
        if field.exclude_elasticsearch:
            continue
        # NOTE(review): children are described without a parent_category,
        # so this record's category is not handed down -- confirm intended.
        children[self.key_to_es(key)] = field.docs_es()
    doc_tree["fields"] = children
    return doc_tree

def to_dict(self):
source = sorted(self.definition.iteritems())
p = {self.key_to_es(k): v.to_dict() for k, v in source}
Expand All @@ -152,8 +203,8 @@ def validate(self, name, value):

class NestedListOf(ListOf):

def __init__(self, object_, subrecord_name, max_items=10):
ListOf.__init__(self, object_, max_items)
def __init__(self, object_, subrecord_name, max_items=10, doc=None, category=None):
ListOf.__init__(self, object_, max_items, doc=doc, category=category)
self.subrecord_name = subrecord_name

def to_bigquery(self, name):
Expand All @@ -162,6 +213,19 @@ def to_bigquery(self, name):
})
retv = subr.to_bigquery(self.key_to_bq(name))
retv["mode"] = "REPEATED"
if self.doc:
retv["doc"] = self.doc
return retv

def docs_bq(self, parent_category=None):
    """Describe this nested list for the BigQuery documentation output.

    Wraps the element in a temporary ``SubRecord`` that has a single field,
    named ``self.subrecord_name``, holding a ``ListOf`` the element. That
    wrapper's ``docs_bq`` description is returned with ``repeated`` set to
    True and, when this list has a truthy doc, ``doc`` overwritten.
    """
    # The inner ListOf is built with default arguments only; this list's own
    # max_items, doc and category are not forwarded to it.
    wrapper = SubRecord({self.subrecord_name: ListOf(self.object_)})
    effective_category = self.category or parent_category
    description = wrapper.docs_bq(parent_category=effective_category)
    description["repeated"] = True
    if self.doc:
        description["doc"] = self.doc
    return description


Expand All @@ -170,24 +234,25 @@ class Record(SubRecord):
def to_es(self, name):
return {name:SubRecord.to_es(self)}

def docs_es(self, name, parent_category=None):
    """Return the Elasticsearch docs description of this record under ``name``.

    The result is a one-entry dict mapping ``name`` to the ``SubRecord``
    description, computed with this record's category when it is truthy and
    with ``parent_category`` otherwise.
    """
    effective_category = parent_category
    if self.category:
        effective_category = self.category
    body = SubRecord.docs_es(self, parent_category=effective_category)
    return {name: body}

def to_bigquery(self):
source = sorted(self.definition.iteritems())
return [s.to_bigquery(name) for (name, s) in source \
if not s.exclude_bigquery]

def to_html(self):
pass
return [s.to_bigquery(name) \
for (name, s) in source \
if not s.exclude_bigquery
]

def to_documented_html(self):
pass
def docs_bq(self, name, parent_category=None):
    """Return the BigQuery docs description of this record under ``name``.

    The result is a one-entry dict mapping ``name`` to the ``SubRecord``
    description, computed with this record's category when it is truthy and
    with ``parent_category`` otherwise.
    """
    effective_category = parent_category
    if self.category:
        effective_category = self.category
    body = SubRecord.docs_bq(self, parent_category=effective_category)
    return {name: body}

def print_indent_string(self):
for name, field in sorted(self.definition.iteritems()):
field.print_indent_string(name, 0)

def to_dotted_text(self):
pass

def validate(self, value):
if type(value) != dict:
raise DataValidationException("record is not a dict", str(value))
Expand All @@ -212,5 +277,3 @@ def to_flat(self):
@classmethod
def from_json(cls, j):
return cls({(k, __encode(v)) for k, v in sorted(j.iteritems())})


41 changes: 30 additions & 11 deletions zschema/leaves.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@ def __init__(self,
es_index=None,
es_analyzer=None,
doc=None,
examples=None,
es_include_raw=None,
deprecated=False,
ignore=False,
autocomplete_include=True,
autocomplete_category=None,
autocomplete_icon=None,
category=None,
exclude=None,
metadata=None,
units=None,
Expand All @@ -30,6 +29,7 @@ def __init__(self,
self.es_index = es_index
self.es_analyzer = es_analyzer
self.doc = doc
self.examples = examples if examples else []
if es_include_raw is not None:
self.es_include_raw = es_include_raw
else:
Expand All @@ -40,9 +40,7 @@ def __init__(self,
e = "WARN: %s is deprecated and will be removed in a "\
"future release\n" % self.__class__.__name__
sys.stderr.write(e)
self.autocomplete_category = autocomplete_category
self.autocomplete_category = autocomplete_category
self.autocomplete_icon = autocomplete_icon
self.category = category
self._exclude = set(exclude) if exclude else set([])
self.metadata = metadata if metadata else {}
self.units = units
Expand All @@ -56,7 +54,8 @@ def to_dict(self):
"type":self.__class__.__name__,
"es_type":self.ES_TYPE,
"bq_type":self.BQ_TYPE,
"metadata":self.metadata
"metadata":self.metadata,
"examples": self.examples,
}
if self.units is not None:
retv["units"] = self.units
Expand All @@ -72,13 +71,36 @@ def to_es(self):
self.add_es_var(retv, "analyzer", "es_analyzer", "ES_ANALYZER")
self.add_es_var(retv, "search_analyzer", "es_search_analyzer",
"ES_SEARCH_ANALYZER")

if self.es_include_raw:
retv["fields"] = {
"raw":{"type":"keyword"}
}
return retv

def _docs_common(self, parent_category):
retv = {
"detail_type": self.__class__.__name__,
"category": self.category or parent_category,
"doc": self.doc,
"required": self.required,
}
if hasattr(self, "values_s") and len(self.values_s):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are retv["values"] and retv["examples"] mutually exclusive?

It would be nice if the values could be individually documented (something like "Algorithms": Enum(documented_values={"value1": "docs for value1"}), except more natural)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems reasonable to let different types define their own type of '.doc' property. The default could be to just hand back ._doc, but a type could also do something else, like compile a list of enumerated values into a doc string.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are retv["values"] and retv["examples"] mutually exclusive?

If there is an exhaustive list of possible values, I see no reason to support a list of example values as well.

It would be nice if the values could be individually documented (something like "Algorithms": Enum(documented_values={"value1": "docs for value1"}), except more natural)

Maybe, though I'm unconvinced this is worth the work to implement. The two examples of enums I recall offhand in our schema are elliptic curves and certificate types (leaf/intermediate/root), both of which seem plenty self-explanatory given the existence of a docstring for the field and a list of possible values.

It seems reasonable to let different types define their own type of '.doc' property. The default could be to just hand back ._doc, but a type could also do something else, like compile a list of enumerated values into a doc string.

@zakird I'm not sure I follow what you're suggesting here. As of this PR, types can have a docstring and a list of example values (or a list of possible values, for enums). What are you suggesting should change?

retv["values"] = list(self.values_s)
else:
retv["examples"] = self.examples
return retv

def docs_es(self, parent_category=None):
    """Describe this leaf field for the Elasticsearch documentation output.

    Builds the shared description via ``_docs_common``, passes it through
    ``add_es_var`` for the analyzer setting, then records the class's
    ``ES_TYPE`` under ``type``.
    """
    description = self._docs_common(parent_category)
    # Presumably fills in "analyzer" from es_analyzer / ES_ANALYZER when one
    # is configured -- add_es_var is defined outside this view; confirm.
    self.add_es_var(description, "analyzer", "es_analyzer", "ES_ANALYZER")
    description.update(type=self.ES_TYPE)
    return description

def docs_bq(self, parent_category=None):
    """Describe this leaf field for the BigQuery documentation output.

    Builds the shared description via ``_docs_common`` and records the
    class's ``BQ_TYPE`` under ``type``.
    """
    description = self._docs_common(parent_category)
    description.update(type=self.BQ_TYPE)
    return description

def to_bigquery(self, name):
if not self._check_valid_name(name):
raise Exception("Invalid field name: %s" % name)
Expand Down Expand Up @@ -118,9 +140,6 @@ def to_flat(self, parent, name, repeated=False):
"mode":mode
}

def to_autocomplete(self, parent, name, repated=False):
pass

def print_indent_string(self, name, indent):
val = self.key_to_string(name)
if indent:
Expand Down