Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

regexp refactor #10

Merged
merged 4 commits into from
Feb 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ linters:
- gosec
- prealloc
- funlen
- lll

issues:
exclude-use-default: false
196 changes: 48 additions & 148 deletions format.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@ package main

import (
"bytes"
"errors"
"go/ast"
"go/format"
"go/parser"
"go/token"
"regexp"
"strings"
"unicode"
"unicode/utf8"

"golang.org/x/tools/go/ast/astutil"
)
Expand All @@ -23,18 +22,8 @@ func Format(src []byte) ([]byte, error) {
return nil, err
}

// Apply transform.
transformed := CommentTransform(f, func(text string) string {
newtext, errf := formula(text)
if errf != nil {
err = errf
return text
}
return newtext
})
if err != nil {
return nil, err
}
// Process every comment as a formula.
transformed := commentreplace(f, formula)

// Format.
buf := bytes.NewBuffer(nil)
Expand All @@ -44,19 +33,19 @@ func Format(src []byte) ([]byte, error) {
return buf.Bytes(), nil
}

// CommentTransform applies transform to the text of every comment under the root AST.
func CommentTransform(root ast.Node, transform func(string) string) ast.Node {
// commentreplace applies repl function to the text of every comment under the root AST.
func commentreplace(root ast.Node, repl func(string) string) ast.Node {
return astutil.Apply(root, func(c *astutil.Cursor) bool {
switch n := c.Node().(type) {
case *ast.Comment:
c.Replace(&ast.Comment{
Slash: n.Slash,
Text: transform(n.Text),
Text: repl(n.Text),
})
case *ast.File:
for _, g := range n.Comments {
for _, comment := range g.List {
comment.Text = transform(comment.Text)
comment.Text = repl(comment.Text)
}
}
}
Expand All @@ -66,9 +55,16 @@ func CommentTransform(root ast.Node, transform func(string) string) ast.Node {

// Fixed data structures required for formula processing.
var (
replacer *strings.Replacer // replacer for symbols.
super = map[rune]rune{} // replacement map for superscript characters.
sub = map[rune]rune{} // replacement map for subscript characters.
// Symbol replacer.
replacer *strings.Replacer

// Regular expressions for super/subscripts.
supregexp *regexp.Regexp
subregexp *regexp.Regexp

// Rune replacement maps.
super = map[rune]rune{}
sub = map[rune]rune{}
)

func init() {
Expand All @@ -79,151 +75,55 @@ func init() {
}
replacer = strings.NewReplacer(oldnew...)

// Build super/subscript replacement maps.
// Build super/subscript character classes and replacement maps.
var superclass, subclass []rune
for _, char := range chars {
if char.Super != None {
superclass = append(superclass, char.Char)
super[char.Char] = char.Super
}
if char.Sub != None {
subclass = append(subclass, char.Char)
sub[char.Char] = char.Sub
}
}
}

// formula processes a formula in s, writing the result to w.
func formula(s string) (string, error) {
if len(s) == 0 {
return "", nil
}

// Replace symbols.
s = replacer.Replace(s)

// Replace super/subscripts.
buf := bytes.NewBuffer(nil)
last := None
for len(s) > 0 {
r, size := utf8.DecodeRuneInString(s)

// Look for a super/subscript character.
var repl map[rune]rune
switch r {
case '^':
repl = super
case '_':
repl = sub
default:
buf.WriteRune(r)
last = r
s = s[size:]
continue
}

// Perform replacement.
if unicode.IsPrint(last) && !unicode.IsSpace(last) {
var err error
s, err = supsub(buf, s, repl)
if err != nil {
return "", err
}
} else {
buf.WriteRune(r)
s = s[size:]
}

last = None
}

return buf.String(), nil
// Build regular expressions.
supregexp = regexp.MustCompile(`(\b[A-Za-z0-9]|\pS)\^(\d+|\{` + charclass(superclass) + `+\}|` + charclass(superclass) + `\s)`)
subregexp = regexp.MustCompile(`(\b[A-Za-z]|\pS)_(\d+\b|\{` + charclass(subclass) + `+\})`)
}

// supsub processes a super/subscript starting at s, writing the result to w.
// The repl map provides the mapping from runes to the corresponding
// super/subscripted versions. Note the first character of s should be the "^"
// or "_" operator.
func supsub(w *bytes.Buffer, s string, repl map[rune]rune) (string, error) {
arg, rest, err := parsearg(s[1:])
if err != nil {
return "", err
}

// If we could not parse an argument, or its not replaceable, just write the
// sub/script operator and return.
if len(arg) == 0 || !replaceable(arg, repl) {
w.WriteByte(s[0])
return s[1:], nil
}

// Perform the replacement.
for _, r := range arg {
w.WriteRune(repl[r])
}

return rest, nil
// charclass builds a regular expression character class ("[...]") that matches
// exactly the given runes.
//
// Runes that carry special meaning inside a character class are
// backslash-escaped so they are matched literally: "-" (range), "^" (negation
// when first), "]" (class terminator), "[" (start of a named class such as
// "[:alpha:]") and "\" (escape introducer). All other runes are emitted as-is.
func charclass(runes []rune) string {
	var b strings.Builder
	// Brackets plus at least one byte per rune; escapes and multi-byte runes
	// grow the buffer as needed.
	b.Grow(len(runes) + 2)
	b.WriteByte('[')
	for _, r := range runes {
		switch r {
		case '\\', ']', '[', '^', '-':
			b.WriteByte('\\')
		}
		b.WriteRune(r)
	}
	b.WriteByte(']')
	return b.String()
}

// parsearg parses the argument to a super/subscript.
func parsearg(s string) (string, string, error) {
if len(s) == 0 {
return "", "", nil
}

// Braced.
if s[0] == '{' {
arg, rest, err := parsebraces(s)
if err != nil {
return "", "", err
}
return arg[1 : len(arg)-1], rest, nil
}

// Look for a numeral.
i := 0
for ; i < len(s) && '0' <= s[i] && s[i] <= '9'; i++ {
}
if i > 0 {
return s[:i], s[i:], nil
}

// Default to the first rune.
_, i = utf8.DecodeRuneInString(s)
return s[:i], s[i:], nil
}

// parsebraces parses matching braces starting at the beginning of s.
func parsebraces(s string) (string, string, error) {
if len(s) == 0 || s[0] != '{' {
return "", "", errors.New("expected {")
}

depth := 0
for i, r := range s {
// Adjust depth if we see open or close brace.
switch r {
case '{':
depth++
case '}':
depth--
}
// formula applies symbol and super/subscript replacements to s and returns the result.
func formula(s string) string {
// Replace symbols.
s = replacer.Replace(s)

// Continue if we have not reached matched braces.
if depth > 0 {
continue
}
// Replace superscripts.
s = supregexp.ReplaceAllStringFunc(s, subsupreplacer(super))

// Return the matched braces.
return s[:i+1], s[i+1:], nil
}
// Replace subscripts.
s = subregexp.ReplaceAllStringFunc(s, subsupreplacer(sub))

return "", "", errors.New("unmatched braces")
return s
}

// replaceable returns whether every rune in s has a replacement in repl.
func replaceable(s string, repl map[rune]rune) bool {
for _, r := range s {
if _, ok := repl[r]; !ok {
return false
// subsupreplacer builds a replacement function that applies the repl rune map
// to a matched super/subscript.
func subsupreplacer(repl map[rune]rune) func(string) string {
return func(s string) string {
var runes []rune
for i, r := range s {
if i == 0 || unicode.IsSpace(r) {
runes = append(runes, r)
} else if repl[r] != None {
runes = append(runes, repl[r])
}
}
return string(runes)
}
return true
}
79 changes: 58 additions & 21 deletions format_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,31 @@ func TestFormula(t *testing.T) {

// Symbols.
{Name: "basic_symbol", Input: "x +- y", Expect: "x ± y"},
{Name: "basic_latex_symbol", Input: "x \\oplus y", Expect: "x ⊕ y"},
{Name: "basic_latex_symbol", Input: `x \oplus y`, Expect: "x ⊕ y"},
{Name: "multi_symbols", Input: "2 <= x <= 10", Expect: "2 ⩽ x ⩽ 10"},

// Super/subscripts.
{Name: "sup_brace_replaceable", Input: "x^{i+j}ab", Expect: "xⁱ⁺ʲab"},
{Name: "sup_numeral_replaceable", Input: "x^123a", Expect: "x¹²³a"},
{Name: "sup_char_replaceable", Input: "x^ijk", Expect: "xⁱjk"},
{Name: "sup_char_replaceable", Input: "x^ijk", Expect: "x^ijk"},

{Name: "sup_brace_nonreplaceable", Input: "x^{p+q}pq", Expect: "x^{p+q}pq"},
{Name: "sup_char_nonreplaceable", Input: "x^qrs", Expect: "x^qrs"},

{Name: "sub_brace_replaceable", Input: "x_{i+j}ab", Expect: "xᵢ₊ⱼab"},
{Name: "sub_numeral_replaceable", Input: "x_123a", Expect: "x₁₂₃a"},
{Name: "sub_char_replaceable", Input: "x_ijk", Expect: "xᵢjk"},
{Name: "sub_digit_brace_replaceable", Input: "2_{i+j}ab", Expect: "2_{i+j}ab"},
{Name: "sub_numeral_boundary_replaceable", Input: "x_123 a", Expect: "x₁₂₃ a"},
{Name: "sub_numeral_non_boundary", Input: "x_123a", Expect: "x_123a"},
{Name: "sub_char_replaceable", Input: "x_ijk", Expect: "x_ijk"},

{Name: "sub_brace_nonreplaceable", Input: "x_{w+x}wx", Expect: "x_{w+x}wx"},
{Name: "sub_char_nonreplaceable", Input: "x_wxy", Expect: "x_wxy"},

// Combination.
{Name: "sup_with_symbol", Input: "\\oplus^23", Expect: "⊕²³"},
{Name: "sub_with_symbol", Input: "\\oplus_23", Expect: "⊕₂₃"},
// Combination of symbols and super/subscripts.
{Name: "sup_with_symbol", Input: `\oplus^23`, Expect: "⊕²³"},
{Name: "sub_with_symbol", Input: `\oplus_23`, Expect: "⊕₂₃"},
{Name: "sup_brace_with_symbol", Input: `\oplus^{i+j}`, Expect: "⊕ⁱ⁺ʲ"},
{Name: "sub_brace_with_symbol", Input: `\oplus_{i+j}`, Expect: "⊕ᵢ₊ⱼ"},

// Malformed.
{Name: "sup_first_char", Input: "^a", Expect: "^a"},
Expand All @@ -48,25 +52,58 @@ func TestFormula(t *testing.T) {
{Name: "sup_space_before", Input: "pre ^a", Expect: "pre ^a"},
{Name: "sub_space_before", Input: "pre _a", Expect: "pre _a"},

{Name: "sup_consecutive", Input: "pre ^^^^^^^a post", Expect: "pre ^^^^^^^a post"},
{Name: "sub_consecutive", Input: "pre _______a post", Expect: "pre _______a post"},

// Regression.
{Name: "sup_with_minus", Input: "2^32-1", Expect: "2³²-1"},
{
Name: "sup_with_minus",
Input: "2^32-1",
Expect: "2³²-1",
},
{
Name: "exp_with_minus",
Input: "p256Invert calculates |out| = |in|^{-1}",
Expect: "p256Invert calculates |out| = |in|⁻¹",
},
}
for _, c := range cases {
c := c // scopelint
t.Run(c.Name, func(t *testing.T) {
got, err := formula(c.Input)
if err != nil {
t.Fatal(err)
}
if got != c.Expect {
t.Logf("input = %q", c.Input)
t.Logf("got = %q", got)
t.Logf("expect = %q", c.Expect)
t.FailNow()
}
AssertFormulaOutput(t, c.Input, c.Expect)
})
}
}

// TestFormulaNoChange verifies that formula returns each listed input
// unmodified. These are regression inputs that should have been left alone.
func TestFormulaNoChange(t *testing.T) {
	// The trailing comment on each entry names the sequence that must not be
	// treated as a super/subscript.
	unchanged := []string{
		// golang.org/x/crypto
		"\"_acme-challenge\" name of the domain being validated.", // subscript "_a"
		"echo -n cert | base64 | tr -d '=' | tr '/+' '_-'", // subscript "_-"
		"thumbprint is precomputed for testKeyEC in jws_test.go", // subscript "_t"
		"The \"signature_algorithms\" extension, if present, limits the key exchange", // subscript "_a"
		"testGetCertificate_tokenCache tests the fallback of token certificate fetches", // subscript "_t"
		"https://en.wikipedia.org/wiki/Automated_Certificate_Management_Environment#CAs_&_PKIs_that_offer_ACME_certificates", // subscripts in URL
		"g8TuAS9g5zhq8ELQ3kmjr-KV86GAMgI6VAcGlq3QrzpTCf_30Ab7-zawrfRaFON", // subscript "_30"
		"JAumQ_I2fjj98_97mk3ihOY4AgVdCDj1z_GCoZkG5Rq7nbCGyosyKWyDX00Zs-n", // subscript "_97"
		"xiToPMinus1Over3 is ξ^((p-1)/3) where ξ = i+3.", // superscript "^("
		"FrobeniusP2 computes (xτ²+yτ+z)^(p²) = xτ^(2p²) + yτ^(p²) + z", // superscript "^("
		"x for a moment, then after applying the Frobenius, we have x̄ω^(2p)", // superscript "^("
		"x̄ξ^((p-1)/3)ω² and applying the inverse isomorphism eliminates the", // superscript "^("
		"be called when the vector facility is available. Implementation in asm_s390x.s.", // subscript "_s"
		"[1] http://csrc.nist.gov/publications/drafts/fips-202/fips_202_draft.pdf", // subscript "_202"
		"Cert generated by ssh-keygen OpenSSH_6.8p1 OS X 10.10.3", // subscript "_6"
	}
	for i := range unchanged {
		text := unchanged[i]
		// The expected output is the input itself.
		AssertFormulaOutput(t, text, text)
	}
}

// AssertFormulaOutput runs formula on input and compares the result with
// expect. On a mismatch it logs the input, the actual output and the expected
// output, then marks the test as failed without stopping it.
func AssertFormulaOutput(t *testing.T, input, expect string) {
	t.Helper()
	actual := formula(input)
	if actual == expect {
		return
	}
	t.Logf("input = %q", input)
	t.Logf("got = %q", actual)
	t.Logf("expect = %q", expect)
	t.Fail()
}
2 changes: 1 addition & 1 deletion testdata/poly1305.in
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ func shiftRightBy2(a uint128) uint128 {
// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
// 128 bits of message, it computes
//
// h_+ = (h + m) * r mod 2^130 - 5
// h_{+} = (h + m) * r mod 2^130 - 5
//
// If the msg length is not a multiple of TagSize, it assumes the last
// incomplete chunk is the final one.
Expand Down