Fix parsing issues with line continuations, macro detection, and comment handling. (#978, #981, #985)
ThrowTheSwitch · Jan 15, 2025 · 95013ba · 95013ba
1 parent b4f0ad3
commit 95013ba
Show file tree

Hide file tree

Showing 7 changed files with 227 additions and 123 deletions.
diff --git a/lib/ceedling/objects.yml b/lib/ceedling/objects.yml
@@ -162,11 +162,14 @@ file_finder:
 file_finder_helper:
   compose: loginator
 
+parsing_parcels:
+
 test_context_extractor:
   compose:
     - configurator
     - file_wrapper
     - loginator
+    - parsing_parcels
 
 include_pathinator:
   compose:
@@ -287,6 +290,8 @@ preprocessinator_file_handler:
     - loginator
 
 preprocessinator_extractor:
+  compose:
+    - parsing_parcels
 
 build_batchinator:
   compose:

diff --git a/lib/ceedling/parsing_parcels.rb b/lib/ceedling/parsing_parcels.rb
@@ -0,0 +1,77 @@
+# =========================================================================
+#   Ceedling - Test-Centered Build System for C
+#   ThrowTheSwitch.org
+#   Copyright (c) 2010-25 Mike Karlesky, Mark VanderVoord, & Greg Williams
+#   SPDX-License-Identifier: MIT
+# =========================================================================
+
+require 'ceedling/encodinator'
+
+# This is a collection of parsing aids to be used in other modules
+class ParsingParcels
+
+  # Sweeps through the input one line at a time (far more memory efficient and faster for large
+  # files than slurping), tidies each line, and yields the purified line to the block for further
+  # analysis. Backslash line continuations are joined so the block sees them as a single line.
+  #   input: any object responding to each_line
+  def code_lines(input)
+    comment_block = false
+    pending = ''.dup
+    input.each_line do |line|
+      # Sanitize before matching; a regex match against invalid bytes can raise ArgumentError
+      line = line.clean_encoding
+      if (m = line.match( /(.*)\\\s*$/ ))
+        # Continuation: hold the text ahead of the trailing backslash until the line completes
+        pending << m[1]
+        next
+      end
+      _line, comment_block = clean_code_line( pending + line, comment_block )
+      pending = ''.dup
+      yield( _line )
+    end
+    # A continuation on the very last line has no successor; flush it instead of dropping it
+    yield( clean_code_line( pending, comment_block ).first ) unless pending.empty?
+  end
+
+  private ######################################################################
+
+  # Strips comments from a single line. `comment_block` tells whether the line begins inside a
+  # multiline block comment. Returns the cleaned line and the updated comment block state.
+  def clean_code_line(line, comment_block)
+    _line = line.clean_encoding
+
+    # Remove line comments
+    _line.gsub!(/\/\/.*$/, '')
+
+    # Handle end of previously begun comment block
+    if comment_block
+      # Ignore contents of the line if it's entirely within a comment block
+      return '', comment_block unless _line.include?( '*/' )
+
+      # Turn off comment block handling state
+      comment_block = false
+      # Remove everything up to the first end of comment block (non-greedy so that code sitting
+      # between it and a later comment on the same line is kept)
+      _line.sub!(/^.*?\*\//, '')
+    end
+
+    # Block comments inside a C string are valid C, but we remove to simplify other parsing.
+    # No code we care about will be inside a C string.
+    # Note that we're not attempting the complex case of multiline string enclosed comment blocks
+    _line.gsub!(/"\s*\/\*.*"/, '')
+
+    # Remove single-line block comments one at a time (non-greedy) to keep code between them
+    _line.gsub!(/\/\*.*?\*\//, '')
+
+    # Handle beginning of any remaining multiline comment block
+    if _line.include?( '/*' )
+      comment_block = true
+
+      # Remove beginning of block comment
+      _line.gsub!(/\/\*.*/, '')
+    end
+
+    return _line, comment_block
+  end
+
+end
diff --git a/lib/ceedling/preprocessinator_extractor.rb b/lib/ceedling/preprocessinator_extractor.rb
@@ -7,9 +7,12 @@
 
 require 'ceedling/constants'
 require 'ceedling/encodinator'
+require 'ceedling/parsing_parcels'
 
 class PreprocessinatorExtractor 
 
+  constructor :parsing_parcels
+
   ##
   ## Preprocessor Expansion Output Handling
   ## ======================================
@@ -138,8 +141,8 @@ def extract_test_directive_macro_calls(file_contents)
     # Look for TEST_SOURCE_FILE("...") and TEST_INCLUDE_PATH("...") in a string (i.e. a file's contents as a string)
 
     regexes = [
-      /#{UNITY_TEST_SOURCE_FILE}.+?"\)/,
-      /#{UNITY_TEST_INCLUDE_PATH}.+?"\)/
+      /#{UNITY_TEST_SOURCE_FILE}\(\s*\"\s*[^"]+\s*\"\s*\)/,
+      /#{UNITY_TEST_INCLUDE_PATH}\(\s*\"\s*[^"]+\s*\"\s*\)/
     ]
 
     return extract_tokens_by_regex_list( file_contents, *regexes )
@@ -199,7 +202,7 @@ def extract_multiline_directives(file_contents, directive)
     #  - Captures all text (non-greedily) after '#<directive>' on a first line through 0 or more line continuations up to a final newline.
     #  - Line continuations comprise a final '\' on a given line followed by whitespace & newline, wrapping to the next
     #    line up to a final '\' on that next line.
-    regex = /(#\s*#{directive}\s+.*?(\\\s*\n.*?)*)\n/
+    regex = /(#\s*#{directive}[^\n]*)\n/
 
     tokens = extract_tokens_by_regex_list( file_contents, regex )
 
@@ -227,7 +230,9 @@ def extract_tokens_by_regex_list(file_contents, *regexes)
 
     # For each regex provided, extract all matches from the source string
     regexes.each do |regex|
-      tokens += file_contents.scan( regex )
+      @parsing_parcels.code_lines( file_contents ) do |line|
+        tokens += line.scan( regex )
+      end
     end
 
     return tokens

diff --git a/lib/ceedling/test_context_extractor.rb b/lib/ceedling/test_context_extractor.rb
@@ -12,7 +12,7 @@
 
 class TestContextExtractor
 
-  constructor :configurator, :file_wrapper, :loginator
+  constructor :configurator, :file_wrapper, :loginator, :parsing_parcels
 
   def setup
     # Per test-file lookup hashes
@@ -53,7 +53,10 @@ def collect_simple_context( filepath, input, *args )
     source_extras = []
     includes = []
 
-    code_lines( input ) do |line|
+    @parsing_parcels.code_lines( input ) do |line|
+      # Strip out comments
+      line.gsub!(/\/\/.*/,'')
+
       if args.include?( :build_directive_include_paths )
         # Scan for build directives: TEST_INCLUDE_PATH()
         include_paths += extract_build_directive_include_paths( line )
@@ -99,7 +102,7 @@ def collect_test_runner_details(test_filepath, input_filepath=nil)
   def extract_includes(input)
     includes = []
 
-    code_lines( input ) {|line| includes += _extract_includes( line ) }
+    @parsing_parcels.code_lines( input ) {|line| includes += _extract_includes( line ) }
 
     return includes.uniq
   end
@@ -235,16 +238,6 @@ def ingest_includes(filepath, includes)
     end
   end
 
-  # Exposed for testing
-  def code_lines(input)
-    comment_block = false
-    # Far more memory efficient and faster (for large files) than slurping entire file into memory
-    input.each_line do |line|
-      _line, comment_block = clean_code_line( line, comment_block )
-      yield( _line )
-    end    
-  end
-
   private #################################
 
   def collect_build_directive_source_files(filepath, files)
@@ -293,8 +286,8 @@ def _collect_test_runner_details(filepath, test_content, input_content=nil)
   def extract_build_directive_source_files(line)
     source_extras = []
 
-    # Look for TEST_SOURCE_FILE("<*>.<*>") statement
-    results = line.scan(/#{UNITY_TEST_SOURCE_FILE}\(\s*\"\s*(.+?\.\w+)*?\s*\"\s*\)/)
+    # Look for TEST_SOURCE_FILE("<*>") statement
+    results = line.scan(/#{UNITY_TEST_SOURCE_FILE}\(\s*\"\s*([^"]+)\s*\"\s*\)/)
     results.each do |result|
       source_extras << FilePathUtils.standardize( result[0] )
     end
@@ -306,7 +299,7 @@ def extract_build_directive_include_paths(line)
     include_paths = []
 
     # Look for TEST_INCLUDE_PATH("<*>") statements
-    results = line.scan(/#{UNITY_TEST_INCLUDE_PATH}\(\s*\"\s*(.+?)\s*\"\s*\)/)
+    results = line.scan(/#{UNITY_TEST_INCLUDE_PATH}\(\s*\"\s*([^"]+)\s*\"\s*\)/)
     results.each do |result|
       include_paths << FilePathUtils.standardize( result[0] )
     end
@@ -371,46 +364,6 @@ def form_file_key( filepath )
     return filepath.to_s.to_sym
   end
 
-  def clean_code_line(line, comment_block)
-    _line = line.clean_encoding
-
-    # Remove line comments
-    _line.gsub!(/\/\/.*$/, '')
-
-    # Handle end of previously begun comment block
-    if comment_block
-      if _line.include?( '*/' )
-        # Turn off comment block handling state
-        comment_block = false
-
-        # Remove everything up to end of comment block
-        _line.gsub!(/^.*\*\//, '')
-      else
-        # Ignore contents of the line if its entirely within a comment block
-        return '', comment_block        
-      end
-
-    end
-
-    # Block comments inside a C string are valid C, but we remove to simplify other parsing.
-    # No code we care about will be inside a C string.
-    # Note that we're not attempting the complex case of multiline string enclosed comment blocks
-    _line.gsub!(/"\s*\/\*.*"/, '')
-
-    # Remove single-line block comments
-    _line.gsub!(/\/\*.*\*\//, '')
-
-    # Handle beginning of any remaining multiline comment block
-    if _line.include?( '/*' )
-      comment_block = true
-
-      # Remove beginning of block comment
-      _line.gsub!(/\/\*.*/, '')
-    end
-
-    return _line, comment_block
-  end
-
   def debug_log_list(message, filepath, list)
     msg = "#{message} in #{filepath}:"
     if list.empty?

diff --git a/spec/parsing_parcels_spec.rb b/spec/parsing_parcels_spec.rb
@@ -0,0 +1,66 @@
+# =========================================================================
+#   Ceedling - Test-Centered Build System for C
+#   ThrowTheSwitch.org
+#   Copyright (c) 2010-25 Mike Karlesky, Mark VanderVoord, & Greg Williams
+#   SPDX-License-Identifier: MIT
+# =========================================================================
+
+require 'spec_helper'
+require 'ceedling/parsing_parcels'
+
+describe ParsingParcels do
+
+  # Fresh parser instance for every example
+  subject(:parcels) { described_class.new }
+
+  # Feeds the text through #code_lines and gathers every yielded line, trimmed of surrounding
+  # whitespace, leaving out any line that ends up blank
+  def gather_code_lines(text)
+    gathered = []
+    parcels.code_lines( StringIO.new( text ) ) do |line|
+      line.strip!
+      gathered << line unless line.empty?
+    end
+    gathered
+  end
+
+  context "#code_lines" do
+    it "should clean code of encoding problems and comments" do
+      # Fixture mixing line comments, block comments, and comment markers inside C strings
+      file_contents = <<~CONTENTS
+      /* TEST_SOURCE_FILE("foo.c") */    // Eliminate single line comment block
+      // TEST_SOURCE_FILE("bar.c")       // Eliminate single line comment
+      Some text⛔️
+      /* // /*                           // Eliminate tricky comment block enclosing comments
+        TEST_SOURCE_FILE("boom.c")
+        */   //                          // Eliminate trailing single line comment following block comment
+      More text
+      #define STR1 "/* comment  "        // Strip out (single line) C string containing block comment
+      #define STR2 "  /* comment  "      // Strip out (single line) C string containing block comment
+      CONTENTS
+
+      # Only text outside of comments survives, with each #define reduced to its name
+      expected = [
+        'Some text', # ⛔️ removed with encoding sanitizing
+        'More text',
+        "#define STR1",
+        "#define STR2"
+      ]
+
+      expect( gather_code_lines( file_contents ) ).to eq expected
+    end
+
+    it "should treat continuations as a single line" do
+      # A commented-out directive continued onto the next line swallows that line as well
+      file_contents = "// TEST_SOURCE_FILE(\"foo.c\") \\  \nTEST_SOURCE_FILE(\"bar.c\")\nSome text⛔️ \\\nMore text\n"
+
+      expected = [
+        'Some text More text'
+      ]
+
+      expect( gather_code_lines( file_contents ) ).to eq expected
+    end
+
+  end
+
+end