Skip to content

Commit

Permalink
scanners: begin port to re2c
Browse files Browse the repository at this point in the history
  • Loading branch information
kivikakk committed Mar 25, 2023
1 parent 8e6f193 commit 3f07025
Show file tree
Hide file tree
Showing 4 changed files with 2,166 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
src/scanners.rs linguist-generated
src/scanners.re linguist-language=Rust
5 changes: 2 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
docker:
docker build -t comrak $(CURDIR)/script
docker run --privileged -t -i -v $(CURDIR):/src/comrak -v $(HOME)/.cargo/registry:/root/.cargo/registry -w /src/comrak comrak /bin/bash
src/scanners.rs: src/scanners.re
re2rust -W -Werror --case-insensitive -i --no-generation-date -8 --encoding-policy substitute -o $@ $<

bench:
cargo build --release
Expand Down
234 changes: 234 additions & 0 deletions src/scanners.re
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
use memchr::memmem;
use std::str;
use pest::Parser;
use pest_derive::Parser;

// Embeds the text of `lexer.pest` in debug builds only. The constant is never
// read in this file (hence the leading underscore).
// NOTE(review): presumably present so that edits to the grammar file trigger a
// rebuild — confirm before removing.
#[cfg(debug_assertions)]
const _LEXER: &str = include_str!("lexer.pest");

/// Parser type generated by `pest_derive` from the grammar in `lexer.pest`.
///
/// The derive also produces the `Rule` enum that `search` and `is_match`
/// take as their first argument.
#[derive(Parser)]
#[grammar = "lexer.pest"]
struct Lexer;

/// Parses `line` with the pest grammar, starting from `rule`.
///
/// Returns the end offset of the span of the last pair produced by a
/// successful parse, or `None` when the parse fails.
///
/// # Panics
///
/// Panics if the parse succeeds but yields no pairs at all.
#[inline(always)]
fn search(rule: Rule, line: &[u8]) -> Option<usize> {
    // SAFETY: `from_utf8_unchecked` performs no validation, so this relies on
    // every caller handing in valid UTF-8.
    // NOTE(review): confirm that invariant at the call sites.
    let text = unsafe { str::from_utf8_unchecked(line) };

    Lexer::parse(rule, text)
        .ok()
        .map(|pairs| pairs.last().unwrap().as_span().end())
}
/// Reports whether `line` parses successfully under `rule`.
///
/// Unlike `search`, the parse result itself is discarded; only success or
/// failure is returned.
#[inline(always)]
fn is_match(rule: Rule, line: &[u8]) -> bool {
    // SAFETY: `from_utf8_unchecked` performs no validation, so this relies on
    // every caller handing in valid UTF-8.
    // NOTE(review): confirm that invariant at the call sites.
    let text = unsafe { str::from_utf8_unchecked(line) };
    let outcome = Lexer::parse(rule, text);
    outcome.is_ok()
}

// TODO: consider dropping all the #[inline(always)], we probably don't know
// better than rustc.

// Global re2c configuration shared by every `/*!re2c ... */` block in this
// file. Each scanner function must declare locals named `cursor` and `marker`
// and take its input as `s`, because the definitions below refer to those
// names directly.
//
// YYFILL is disabled, so the generated code never checks for end of input on
// its own. YYPEEK therefore bounds-checks the cursor and yields a NUL byte
// once the slice is exhausted; NUL acts as the end-of-input sentinel, and no
// rule in this file matches through it. Without that check, a slice that ends
// in the middle of a potential match (for example `b"#"`) would be read one
// byte past its end through `get_unchecked`.
/*!re2c
re2c:define:YYCTYPE = u8;
re2c:define:YYPEEK = "if cursor < s.len() { *s.get_unchecked(cursor) } else { 0 }";
re2c:define:YYSKIP = "cursor += 1;";
re2c:define:YYBACKUP = "marker = cursor;";
re2c:define:YYRESTORE = "cursor = marker;";
re2c:yyfill:enable = 0;
*/

/// Scans for a heading opener anchored at the very start of `s`.
///
/// The re2c rule accepts one to six `#` bytes followed by either a run of one
/// or more spaces/tabs or a single CR or LF byte. On a match the function
/// returns `Some(cursor)`; scanning starts with `cursor` at 0, so that value
/// is the offset re2c has advanced to when the rule fires. Any other input
/// falls through to the default `*` rule and yields `None`.
#[inline(always)]
pub fn atx_heading_start(s: &[u8]) -> Option<usize> {
// `cursor` and `marker` are the names the file-level YYPEEK / YYSKIP /
// YYBACKUP / YYRESTORE definitions refer to; they must keep these names.
let mut cursor = 0;
let mut marker = 0;
/*!re2c
[#]{1,6} ([ \t]+|[\r\n]) { return Some(cursor); }
* { return None; }
*/
}

/// Reports whether `s` contains one of a fixed set of closing tags before its
/// first LF or NUL byte.
///
/// The re2c rule skips any run of bytes other than LF and NUL and then
/// requires `</`, one of `script`, `pre`, `textarea` or `style`, and `>`.
/// The tag names are written as single-quoted re2c strings, which re2c
/// matches case-insensitively. Returns `true` on a match; any other input
/// reaches the default `*` rule and returns `false`.
#[inline(always)]
pub fn html_block_end_1(s: &[u8]) -> bool {
// `cursor` and `marker` are the names the file-level YYPEEK / YYSKIP /
// YYBACKUP / YYRESTORE definitions refer to; they must keep these names.
let mut cursor = 0;
let mut marker = 0;
/*!re2c
[^\n\x00]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return true; }
* { return false; }
*/
}

/// Reports whether the byte sequence `-->` occurs anywhere in `line`.
#[inline(always)]
pub fn html_block_end_2(line: &[u8]) -> bool {
    const TERMINATOR: &[u8] = b"-->";
    let hit = memmem::find(line, TERMINATOR);
    hit.is_some()
}

/// Reports whether the byte sequence `?>` occurs anywhere in `line`.
#[inline(always)]
pub fn html_block_end_3(line: &[u8]) -> bool {
    const TERMINATOR: &[u8] = b"?>";
    let hit = memmem::find(line, TERMINATOR);
    hit.is_some()
}

/// Reports whether `line` contains a `>` byte anywhere.
#[inline(always)]
pub fn html_block_end_4(line: &[u8]) -> bool {
    const CLOSER: u8 = b'>';
    line.contains(&CLOSER)
}

/// Reports whether the byte sequence `]]>` occurs anywhere in `line`.
#[inline(always)]
pub fn html_block_end_5(line: &[u8]) -> bool {
    const TERMINATOR: &[u8] = b"]]>";
    let hit = memmem::find(line, TERMINATOR);
    hit.is_some()
}

/// Runs the `open_code_fence` grammar rule over `line`.
///
/// Input that does not begin with a backtick or a tilde is rejected up front
/// without invoking the parser. Otherwise the result of `search` is returned:
/// the end offset of the parse on success, `None` on failure.
///
/// Empty input yields `None`.
#[inline(always)]
pub fn open_code_fence(line: &[u8]) -> Option<usize> {
    // `starts_with` is simply false for an empty slice, whereas indexing
    // `line[0]` would panic on one.
    if !(line.starts_with(b"`") || line.starts_with(b"~")) {
        return None;
    }
    search(Rule::open_code_fence, line)
}

/// Runs the `close_code_fence` grammar rule over `line`.
///
/// Input that does not begin with a backtick or a tilde is rejected up front
/// without invoking the parser. Otherwise the result of `search` is returned:
/// the end offset of the parse on success, `None` on failure.
///
/// Empty input yields `None`.
#[inline(always)]
pub fn close_code_fence(line: &[u8]) -> Option<usize> {
    // `starts_with` is simply false for an empty slice, whereas indexing
    // `line[0]` would panic on one.
    if !(line.starts_with(b"`") || line.starts_with(b"~")) {
        return None;
    }
    search(Rule::close_code_fence, line)
}

/// Classifies the start of `line` as one of six numbered HTML block kinds.
///
/// Returns `Some(n)` with `n` in `1..=6` for the first check that succeeds,
/// tried in ascending order, or `None` when the line does not begin with `<`
/// or no check succeeds. Kinds 2, 3 and 5 are plain prefix tests (`<!--`,
/// `<?`, `<![CDATA[`); kinds 1, 4 and 6 are decided by grammar rules.
#[inline(always)]
pub fn html_block_start(line: &[u8]) -> Option<usize> {
    const COMMENT_OPEN: &[u8] = b"<!--";
    const INSTRUCTION_OPEN: &[u8] = b"<?";
    const CDATA_OPEN: &[u8] = b"<![CDATA[";

    // Every kind begins with `<`, so anything else is rejected before the
    // parser is involved.
    if line.first() != Some(&b'<') {
        return None;
    }

    // The order below is significant: the first successful check wins.
    if is_match(Rule::html_block_start_1, line) {
        return Some(1);
    }
    if line.starts_with(COMMENT_OPEN) {
        return Some(2);
    }
    if line.starts_with(INSTRUCTION_OPEN) {
        return Some(3);
    }
    if is_match(Rule::html_block_start_4, line) {
        return Some(4);
    }
    if line.starts_with(CDATA_OPEN) {
        return Some(5);
    }
    if is_match(Rule::html_block_start_6, line) {
        return Some(6);
    }
    None
}

/// Tests `line` against the `html_block_start_7` grammar rule.
///
/// Returns `Some(7)` when the rule parses and `None` otherwise, mirroring the
/// numbering used by `html_block_start`.
#[inline(always)]
pub fn html_block_start_7(line: &[u8]) -> Option<usize> {
    if !is_match(Rule::html_block_start_7, line) {
        return None;
    }
    Some(7)
}

/// The character a setext heading underline is made of, as reported by
/// `setext_heading_line`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SetextChar {
    /// The line starts with `=`.
    Equals,
    /// The line starts with `-`.
    Hyphen,
}

/// Tests `line` against the `setext_heading_line` grammar rule and reports
/// which underline character it uses.
///
/// The first byte selects the candidate variant (`=` gives
/// [`SetextChar::Equals`], `-` gives [`SetextChar::Hyphen`]); the parser is
/// only consulted for those two cases. Returns `None` when the line is empty,
/// starts with any other byte, or does not parse under the rule.
#[inline(always)]
pub fn setext_heading_line(line: &[u8]) -> Option<SetextChar> {
    // `first()` returns `None` on an empty slice, whereas indexing `line[0]`
    // would panic on one.
    let kind = match line.first() {
        Some(b'=') => SetextChar::Equals,
        Some(b'-') => SetextChar::Hyphen,
        _ => return None,
    };

    if is_match(Rule::setext_heading_line, line) {
        Some(kind)
    } else {
        None
    }
}

/// Runs the `footnote_definition` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn footnote_definition(line: &[u8]) -> Option<usize> {
search(Rule::footnote_definition, line)
}

/// Runs the `scheme_rule` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn scheme(line: &[u8]) -> Option<usize> {
search(Rule::scheme_rule, line)
}

/// Runs the `autolink_uri` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn autolink_uri(line: &[u8]) -> Option<usize> {
search(Rule::autolink_uri, line)
}

/// Runs the `autolink_email` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn autolink_email(line: &[u8]) -> Option<usize> {
search(Rule::autolink_email, line)
}

/// Runs the `html_tag` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn html_tag(line: &[u8]) -> Option<usize> {
search(Rule::html_tag, line)
}

/// Runs the `html_comment` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn html_comment(line: &[u8]) -> Option<usize> {
search(Rule::html_comment, line)
}

/// Runs the `html_processing_instruction` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn html_processing_instruction(line: &[u8]) -> Option<usize> {
search(Rule::html_processing_instruction, line)
}

/// Runs the `html_declaration` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn html_declaration(line: &[u8]) -> Option<usize> {
search(Rule::html_declaration, line)
}

/// Runs the `html_cdata` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn html_cdata(line: &[u8]) -> Option<usize> {
search(Rule::html_cdata, line)
}

/// Runs the `spacechars` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn spacechars(line: &[u8]) -> Option<usize> {
search(Rule::spacechars, line)
}

/// Runs the `link_title` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn link_title(line: &[u8]) -> Option<usize> {
search(Rule::link_title, line)
}

/// Runs the `shortcode_rule` grammar rule over `line`.
///
/// Only compiled when the `shortcodes` feature is enabled. Returns whatever
/// `search` reports: the end offset of the parse on success, or `None` when
/// `line` does not parse under the rule.
#[cfg(feature = "shortcodes")]
#[inline(always)]
pub fn shortcode(line: &[u8]) -> Option<usize> {
search(Rule::shortcode_rule, line)
}

/// Runs the `table_start` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn table_start(line: &[u8]) -> Option<usize> {
search(Rule::table_start, line)
}

/// Runs the `table_cell` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn table_cell(line: &[u8]) -> Option<usize> {
search(Rule::table_cell, line)
}

/// Runs the `table_cell_end` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn table_cell_end(line: &[u8]) -> Option<usize> {
search(Rule::table_cell_end, line)
}

/// Runs the `table_row_end` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn table_row_end(line: &[u8]) -> Option<usize> {
search(Rule::table_row_end, line)
}

/// Runs the `dangerous_url` grammar rule over `line`.
///
/// Returns whatever `search` reports: the end offset of the parse on success,
/// or `None` when `line` does not parse under the rule.
#[inline(always)]
pub fn dangerous_url(line: &[u8]) -> Option<usize> {
search(Rule::dangerous_url, line)
}

// vim: set ft=rust:
Loading

0 comments on commit 3f07025

Please sign in to comment.