From 95013ba30c4780f02c37041fb03c8be7d746284e Mon Sep 17 00:00:00 2001
From: Mark VanderVoord <mvandervoord@gmail.com>
Date: Tue, 14 Jan 2025 21:15:10 -0500
Subject: [PATCH] Fix parsing issues with line continuations, macro detection,
 and comment handling. (#978, #981, #985)

---
 lib/ceedling/objects.yml                   |  5 ++
 lib/ceedling/parsing_parcels.rb            | 77 +++++++++++++++++++
 lib/ceedling/preprocessinator_extractor.rb | 13 +++-
 lib/ceedling/test_context_extractor.rb     | 65 +++-------------
 spec/parsing_parcels_spec.rb               | 66 ++++++++++++++++
 spec/preprocessinator_extractor_spec.rb    | 88 ++++++++++++++--------
 spec/test_context_extractor_spec.rb        | 36 +--------
 7 files changed, 227 insertions(+), 123 deletions(-)
 create mode 100644 lib/ceedling/parsing_parcels.rb
 create mode 100644 spec/parsing_parcels_spec.rb

diff --git a/lib/ceedling/objects.yml b/lib/ceedling/objects.yml
index f4e22481..af20521a 100644
--- a/lib/ceedling/objects.yml
+++ b/lib/ceedling/objects.yml
@@ -162,11 +162,14 @@ file_finder:
 file_finder_helper:
   compose: loginator
 
+parsing_parcels:
+
 test_context_extractor:
   compose:
     - configurator
     - file_wrapper
     - loginator
+    - parsing_parcels
 
 include_pathinator:
   compose:
@@ -287,6 +290,8 @@ preprocessinator_file_handler:
     - loginator
 
 preprocessinator_extractor:
+  compose:
+    - parsing_parcels
 
 build_batchinator:
   compose:
diff --git a/lib/ceedling/parsing_parcels.rb b/lib/ceedling/parsing_parcels.rb
new file mode 100644
index 00000000..62182b6f
--- /dev/null
+++ b/lib/ceedling/parsing_parcels.rb
@@ -0,0 +1,77 @@
+# =========================================================================
+#   Ceedling - Test-Centered Build System for C
+#   ThrowTheSwitch.org
+#   Copyright (c) 2010-25 Mike Karlesky, Mark VanderVoord, & Greg Williams
+#   SPDX-License-Identifier: MIT
+# =========================================================================
+
+require 'ceedling/encodinator'
+
+# This is a collection of parsing aids to be used in other modules
+class ParsingParcels
+
+  # Sweeps through `input` one physical line at a time (cheaper than slurping a large file), joins
+  # backslash line continuations into one logical line, and yields each logical line to the block
+  # after encoding cleanup and comment removal. A continuation left dangling at the end of input is
+  # still yielded rather than silently dropped.
+  def code_lines(input)
+    comment_block = false
+    full_line = ''
+    input.each_line do |line|
+      # Sanitize encoding first: regex matching can raise on invalid byte sequences
+      full_line += line.clean_encoding
+
+      # A trailing backslash (plus optional whitespace) joins this line to the next one
+      next if full_line.sub!( /\\\s*\z/, '' )
+
+      _line, comment_block = clean_code_line( full_line, comment_block )
+      yield( _line )
+      full_line = ''
+    end
+    # Flush a continuation that the end of input left unterminated
+    yield( clean_code_line( full_line, comment_block ).first ) unless full_line.empty?
+  end
+
+  private ######################################################################
+
+  # Strips comments from one logical line. Returns [cleaned_line, comment_block], where the flag says
+  # whether a block comment is still open at the end of the line (pass it back in for the next line).
+  def clean_code_line(line, comment_block)
+    _line = line.clean_encoding
+
+    # Remove line comments
+    _line.gsub!(/\/\/.*$/, '')
+
+    # Handle end of previously begun comment block
+    if comment_block
+      if _line.include?( '*/' )
+        # Turn off comment block handling state
+        comment_block = false
+        # Remove only up to the first end of comment block so code after it survives
+        _line.sub!(/^.*?\*\//, '')
+      else
+        # Ignore contents of the line if it's entirely within a comment block
+        return '', comment_block
+      end
+    end
+
+    # Block comments inside a C string are valid C, but we remove to simplify other parsing.
+    # No code we care about will be inside a C string.
+    # Note that we're not attempting the complex case of multiline string enclosed comment blocks
+    _line.gsub!(/"\s*\/\*.*"/, '')
+
+    # Remove single-line block comments (non-greedy so code between two comments is kept)
+    _line.gsub!(/\/\*.*?\*\//, '')
+
+    # Handle beginning of any remaining multiline comment block
+    if _line.include?( '/*' )
+      comment_block = true
+
+      # Remove beginning of block comment
+      _line.gsub!(/\/\*.*/, '')
+    end
+
+    return _line, comment_block
+  end
+
+end
diff --git a/lib/ceedling/preprocessinator_extractor.rb b/lib/ceedling/preprocessinator_extractor.rb
index 3fe5ca0b..ce0ce2f3 100644
--- a/lib/ceedling/preprocessinator_extractor.rb
+++ b/lib/ceedling/preprocessinator_extractor.rb
@@ -7,9 +7,12 @@
 
 require 'ceedling/constants'
 require 'ceedling/encodinator'
+require 'ceedling/parsing_parcels'
 
 class PreprocessinatorExtractor 
  
+  constructor :parsing_parcels
+
   ##
   ## Preprocessor Expansion Output Handling
   ## ======================================
@@ -138,8 +141,8 @@ def extract_test_directive_macro_calls(file_contents)
     # Look for TEST_SOURCE_FILE("...") and TEST_INCLUDE_PATH("...") in a string (i.e. a file's contents as a string)
 
     regexes = [
-      /#{UNITY_TEST_SOURCE_FILE}.+?"\)/,
-      /#{UNITY_TEST_INCLUDE_PATH}.+?"\)/
+      /#{UNITY_TEST_SOURCE_FILE}\(\s*\"\s*[^"]+\s*\"\s*\)/,
+      /#{UNITY_TEST_INCLUDE_PATH}\(\s*\"\s*[^"]+\s*\"\s*\)/
     ]
 
     return extract_tokens_by_regex_list( file_contents, *regexes )
@@ -199,7 +202,7 @@ def extract_multiline_directives(file_contents, directive)
     #  - Captures all text (non-greedily) after '#<directive>' on a first line through 0 or more line continuations up to a final newline.
     #  - Line continuations comprise a final '\' on a given line followed by whitespace & newline, wrapping to the next
     #    line up to a final '\' on that next line.
-    regex = /(#\s*#{directive}\s+.*?(\\\s*\n.*?)*)\n/
+    regex = /(#\s*#{directive}[^\n]*)\n/
 
     tokens = extract_tokens_by_regex_list( file_contents, regex )
 
@@ -227,7 +230,9 @@ def extract_tokens_by_regex_list(file_contents, *regexes)
 
     # For each regex provided, extract all matches from the source string
     regexes.each do |regex|
-      tokens += file_contents.scan( regex )
+      @parsing_parcels.code_lines( file_contents ) do |line|
+        tokens += line.scan( regex )
+      end
     end
 
     return tokens
diff --git a/lib/ceedling/test_context_extractor.rb b/lib/ceedling/test_context_extractor.rb
index 7e848824..48277de5 100644
--- a/lib/ceedling/test_context_extractor.rb
+++ b/lib/ceedling/test_context_extractor.rb
@@ -12,7 +12,7 @@
 
 class TestContextExtractor
 
-  constructor :configurator, :file_wrapper, :loginator
+  constructor :configurator, :file_wrapper, :loginator, :parsing_parcels
 
   def setup
     # Per test-file lookup hashes
@@ -53,7 +53,10 @@ def collect_simple_context( filepath, input, *args )
     source_extras = []
     includes = []
 
-    code_lines( input ) do |line|
+    @parsing_parcels.code_lines( input ) do |line|
+      # Strip out comments
+      line.gsub!(/\/\/.*/,'')
+
       if args.include?( :build_directive_include_paths )
         # Scan for build directives: TEST_INCLUDE_PATH()
         include_paths += extract_build_directive_include_paths( line )
@@ -99,7 +102,7 @@ def collect_test_runner_details(test_filepath, input_filepath=nil)
   def extract_includes(input)
     includes = []
 
-    code_lines( input ) {|line| includes += _extract_includes( line ) }
+    @parsing_parcels.code_lines( input ) {|line| includes += _extract_includes( line ) }
 
     return includes.uniq
   end
@@ -235,16 +238,6 @@ def ingest_includes(filepath, includes)
     end
   end
 
-  # Exposed for testing
-  def code_lines(input)
-    comment_block = false
-    # Far more memory efficient and faster (for large files) than slurping entire file into memory
-    input.each_line do |line|
-      _line, comment_block = clean_code_line( line, comment_block )
-      yield( _line )
-    end    
-  end
-
   private #################################
 
   def collect_build_directive_source_files(filepath, files)
@@ -293,8 +286,8 @@ def _collect_test_runner_details(filepath, test_content, input_content=nil)
   def extract_build_directive_source_files(line)
     source_extras = []
 
-    # Look for TEST_SOURCE_FILE("<*>.<*>") statement
-    results = line.scan(/#{UNITY_TEST_SOURCE_FILE}\(\s*\"\s*(.+?\.\w+)*?\s*\"\s*\)/)
+    # Look for TEST_SOURCE_FILE("<*>") statement
+    results = line.scan(/#{UNITY_TEST_SOURCE_FILE}\(\s*\"\s*([^"]+)\s*\"\s*\)/)
     results.each do |result|
       source_extras << FilePathUtils.standardize( result[0] )
     end
@@ -306,7 +299,7 @@ def extract_build_directive_include_paths(line)
     include_paths = []
 
     # Look for TEST_INCLUDE_PATH("<*>") statements
-    results = line.scan(/#{UNITY_TEST_INCLUDE_PATH}\(\s*\"\s*(.+?)\s*\"\s*\)/)
+    results = line.scan(/#{UNITY_TEST_INCLUDE_PATH}\(\s*\"\s*([^"]+)\s*\"\s*\)/)
     results.each do |result|
       include_paths << FilePathUtils.standardize( result[0] )
     end
@@ -371,46 +364,6 @@ def form_file_key( filepath )
     return filepath.to_s.to_sym
   end
 
-  def clean_code_line(line, comment_block)
-    _line = line.clean_encoding
-
-    # Remove line comments
-    _line.gsub!(/\/\/.*$/, '')
-
-    # Handle end of previously begun comment block
-    if comment_block
-      if _line.include?( '*/' )
-        # Turn off comment block handling state
-        comment_block = false
-        
-        # Remove everything up to end of comment block
-        _line.gsub!(/^.*\*\//, '')
-      else
-        # Ignore contents of the line if its entirely within a comment block
-        return '', comment_block        
-      end
-
-    end
-
-    # Block comments inside a C string are valid C, but we remove to simplify other parsing.
-    # No code we care about will be inside a C string.
-    # Note that we're not attempting the complex case of multiline string enclosed comment blocks
-    _line.gsub!(/"\s*\/\*.*"/, '')
-
-    # Remove single-line block comments
-    _line.gsub!(/\/\*.*\*\//, '')
-
-    # Handle beginning of any remaining multiline comment block
-    if _line.include?( '/*' )
-      comment_block = true
-
-      # Remove beginning of block comment
-      _line.gsub!(/\/\*.*/, '')
-    end
-
-    return _line, comment_block
-  end
-
   def debug_log_list(message, filepath, list)
     msg = "#{message} in #{filepath}:"
     if list.empty?
diff --git a/spec/parsing_parcels_spec.rb b/spec/parsing_parcels_spec.rb
new file mode 100644
index 00000000..19e1f1db
--- /dev/null
+++ b/spec/parsing_parcels_spec.rb
@@ -0,0 +1,66 @@
+# =========================================================================
+#   Ceedling - Test-Centered Build System for C
+#   ThrowTheSwitch.org
+#   Copyright (c) 2010-25 Mike Karlesky, Mark VanderVoord, & Greg Williams
+#   SPDX-License-Identifier: MIT
+# =========================================================================
+
+require 'spec_helper'
+require 'ceedling/parsing_parcels'
+
+describe ParsingParcels do
+  before(:each) do
+    # ParsingParcels declares no injected dependencies, so it is constructed without arguments
+    @parsing_parcels = described_class.new()
+  end
+
+  context "#code_lines" do
+    it "should clean code of encoding problems and comments" do
+      file_contents = <<~CONTENTS
+      /* TEST_SOURCE_FILE("foo.c") */    // Eliminate single line comment block
+      // TEST_SOURCE_FILE("bar.c")       // Eliminate single line comment
+      Some text⛔️
+      /* // /*                           // Eliminate tricky comment block enclosing comments
+        TEST_SOURCE_FILE("boom.c")
+        */   //                          // Eliminate trailing single line comment following block comment
+      More text
+      #define STR1 "/* comment  "        // Strip out (single line) C string containing block comment
+      #define STR2 "  /* comment  "      // Strip out (single line) C string containing block comment
+      CONTENTS
+      # Gather only the lines that are still non-blank once cleaned and stripped
+      got = []
+
+      @parsing_parcels.code_lines( StringIO.new( file_contents ) ) do |line|
+        line.strip!
+        got << line if !line.empty?
+      end
+
+      expected = [
+        'Some text', # ⛔️ removed with encoding sanitizing
+        'More text',
+        "#define STR1",
+        "#define STR2"
+      ]
+
+      expect( got ).to eq expected
+    end
+
+    it "should treat continuations as a single line" do
+      file_contents = "// TEST_SOURCE_FILE(\"foo.c\") \\  \nTEST_SOURCE_FILE(\"bar.c\")\nSome text⛔️ \\\nMore text\n"
+      got = []
+
+      @parsing_parcels.code_lines( StringIO.new( file_contents ) ) do |line|
+        line.strip!
+        got << line if !line.empty?
+      end
+      # The line comment continues onto physical line 2, so both vanish; lines 3-4 merge into one
+      expected = [
+        'Some text More text'
+      ]
+
+      expect( got ).to eq expected
+    end
+
+  end
+
+end
diff --git a/spec/preprocessinator_extractor_spec.rb b/spec/preprocessinator_extractor_spec.rb
index 6ed30ab4..843c6f48 100644
--- a/spec/preprocessinator_extractor_spec.rb
+++ b/spec/preprocessinator_extractor_spec.rb
@@ -5,9 +5,20 @@
 #   SPDX-License-Identifier: MIT
 # =========================================================================
 
+require 'spec_helper'
 require 'ceedling/preprocessinator_extractor'
+require 'ceedling/parsing_parcels'
 
 describe PreprocessinatorExtractor do
+  before(:each) do
+    @parsing_parcels = ParsingParcels.new()
+    @extractor = described_class.new(
+      {
+        :parsing_parcels => @parsing_parcels
+      }
+    )
+  end
+
   context "#extract_file_as_array_from_expansion" do
     it "should simply extract text of original file from preprocessed expansion" do
       filepath = "path/to/WANT.c"
@@ -32,7 +43,7 @@
 
       input = StringIO.new( file_contents.join( "\n" ) )
 
-      expect( subject.extract_file_as_array_from_expansion( input, filepath ) ).to eq expected
+      expect( @extractor.extract_file_as_array_from_expansion( input, filepath ) ).to eq expected
     end
 
     it "should extract text of original file from preprocessed expansion preserving #directives and cleaning up whitespace)" do
@@ -62,7 +73,7 @@
 
       input = StringIO.new( file_contents.join( "\n" ) )
 
-      expect( subject.extract_file_as_array_from_expansion( input, filepath ) ).to eq expected
+      expect( @extractor.extract_file_as_array_from_expansion( input, filepath ) ).to eq expected
     end
 
     it "should extract text of original file from preprocessed expansion with complex preprocessor directive sequence" do
@@ -97,7 +108,7 @@
 
       input = StringIO.new( file_contents.join( "\n" ) )
 
-      expect( subject.extract_file_as_array_from_expansion(input, filepath) ).to eq expected
+      expect( @extractor.extract_file_as_array_from_expansion(input, filepath) ).to eq expected
     end
   end
 
@@ -125,7 +136,7 @@
 
       input = StringIO.new( file_contents.join( "\n" ) )
 
-      expect( subject.extract_file_as_string_from_expansion( input, filepath ) ).to eq expected
+      expect( @extractor.extract_file_as_string_from_expansion( input, filepath ) ).to eq expected
     end
   end
 
@@ -146,7 +157,27 @@
         'TEST_INCLUDE_PATH("hello/there")'
       ]
 
-      expect( subject.extract_test_directive_macro_calls( file_text ) ).to eq expected
+      expect( @extractor.extract_test_directive_macro_calls( file_text ) ).to eq expected
+    end
+  end
+
+  context "#extract_test_directive_macro_calls" do
+    it "should extract only uncommented calls" do
+      file_text = <<~FILE_TEXT
+        TEST_SOURCE_FILE("foo/bar/file.c")//TEST_SOURCE_FILE("yo/data.c")
+
+            TEST_INCLUDE_PATH("some/inc/dir")
+        SOME_MACRO(TEST_INCLUDE_PATH("another/dir")) TEST_INCLUDE_PATH("hello/there")
+      FILE_TEXT
+
+      expected = [
+        'TEST_SOURCE_FILE("foo/bar/file.c")',
+        'TEST_INCLUDE_PATH("some/inc/dir")',
+        'TEST_INCLUDE_PATH("another/dir")',
+        'TEST_INCLUDE_PATH("hello/there")'
+      ]
+
+      expect( @extractor.extract_test_directive_macro_calls( file_text ) ).to eq expected
     end
   end
 
@@ -177,15 +208,15 @@
       expected = [
         "#pragma pack(1)",
         [
-          "#pragma TOOL command \\",
-          "          with_some_args  \\",
+          "#pragma TOOL command ",
+          "          with_some_args  ",
           "          that wrap"
-        ],
+        ].join,
         "#pragma warning(disable : 4996)",
         "#pragma GCC optimize(\"O3\")"
       ]
 
-      expect( subject.extract_pragmas( file_text ) ).to eq expected
+      expect( @extractor.extract_pragmas( file_text ) ).to eq expected
     end
   end
 
@@ -200,7 +231,7 @@
         #endif // _HEADER_INCLUDE_GUARD_
       FILE_TEXT
 
-      expect( subject.extract_include_guard( file_text ) ).to eq '_HEADER_INCLUDE_GUARD_'
+      expect( @extractor.extract_include_guard( file_text ) ).to eq '_HEADER_INCLUDE_GUARD_'
     end
 
     it "should extract the first text that looks like an include guard from among file text" do
@@ -216,7 +247,7 @@
         #endif // HEADER_INCLUDE_GUARD
       FILE_TEXT
 
-      expect( subject.extract_include_guard( file_text ) ).to eq 'HEADER_INCLUDE_GUARD'
+      expect( @extractor.extract_include_guard( file_text ) ).to eq 'HEADER_INCLUDE_GUARD'
     end
 
     it "should not extract an include guard from among file text" do
@@ -229,7 +260,7 @@
         #endif // SOME_GUARD_NAME
       FILE_TEXT
 
-      expect( subject.extract_include_guard( file_text ) ).to eq nil
+      expect( @extractor.extract_include_guard( file_text ) ).to eq nil
     end
   end
 
@@ -279,26 +310,26 @@
         "#define SQUARE(x) ((x) * (x))",
         "#define MAX(a, b) ((a) > (b) ? (a) : (b))",
         [
-          "#define MACRO(num, str) {\\",
-          "            printf(\"%d\", num);\\",
-          "            printf(\" is\");            \\",
-          "            printf(\" %s number\", str);\\",
-          "            printf(\"\\n\");\\",
+          "#define MACRO(num, str) {",
+          "            printf(\"%d\", num);",
+          "            printf(\" is\");            ",
+          "            printf(\" %s number\", str);",
+          "            printf(\"\\n\");",
           "           }"
-        ],
+        ].join,
         [
-          "#define LONG_STRING \"This is a very long string that \\",
+          "#define LONG_STRING \"This is a very long string that ",
           "                      continues on the next line\""
-        ],
+        ].join,
         [
-          "#define MULTILINE_MACRO do { \\",
-          "      something(); \\",
-          "      something_else(); \\",
+          "#define MULTILINE_MACRO do { ",
+          "      something(); ",
+          "      something_else(); ",
           "    } while(0)"
-        ]
+        ].join
       ]
 
-      expect( subject.extract_macro_defs( file_text, nil ) ).to eq expected
+      expect( @extractor.extract_macro_defs( file_text, nil ) ).to eq expected
     end
 
     it "should ignore include guard among macro defintions in file text" do
@@ -316,13 +347,10 @@
 
       expected = [
         "#define PI 3.14159",
-        [
-          "#define LONG_STRING \"This is a very long string that \\",
-          "                      continues on the next line\""
-        ]
+        "#define LONG_STRING \"This is a very long string that                       continues on the next line\""
       ]
 
-      expect( subject.extract_macro_defs( file_text, '_INCLUDE_GUARD_' ) ).to eq expected
+      expect( @extractor.extract_macro_defs( file_text, '_INCLUDE_GUARD_' ) ).to eq expected
     end
 
   end
diff --git a/spec/test_context_extractor_spec.rb b/spec/test_context_extractor_spec.rb
index 1c7a8833..ab0d123a 100644
--- a/spec/test_context_extractor_spec.rb
+++ b/spec/test_context_extractor_spec.rb
@@ -7,11 +7,13 @@
 
 require 'spec_helper'
 require 'ceedling/test_context_extractor'
+require 'ceedling/parsing_parcels'
 require 'ceedling/exceptions'
 
 describe TestContextExtractor do
   before(:each) do
     # Mock injected dependencies
+    @parsing_parcels = ParsingParcels.new()
     @configurator = double( "Configurator" ) # Use double() so we can mock needed methods that are added dynamically at startup
     @file_wrapper = double( "FileWrapper" ) # Not actually exercised in these test cases
     loginator = instance_double( "Loginator" )
@@ -41,6 +43,7 @@
       {
         :configurator => @configurator,
         :file_wrapper => @file_wrapper,
+        :parsing_parcels => @parsing_parcels,
         :loginator => loginator
       }
     )
@@ -94,39 +97,6 @@
     end
   end
 
-  context "#code_lines" do
-    it "should clean code of encoding problems and comments" do
-      file_contents = <<~CONTENTS
-      /* TEST_SOURCE_FILE("foo.c") */    // Eliminate single line comment block
-      // TEST_SOURCE_FILE("bar.c")       // Eliminate single line comment
-      Some text⛔️
-      /* // /*                           // Eliminate tricky comment block enclosing comments
-        TEST_SOURCE_FILE("boom.c")
-        */   //                          // Eliminate trailing single line comment following block comment
-      More text
-      #define STR1 "/* comment  "        // Strip out (single line) C string containing block comment
-      #define STR2 "  /* comment  "      // Strip out (single line) C string containing block comment
-      CONTENTS
-
-      got = []
-
-      @extractor.code_lines( StringIO.new( file_contents ) ) do |line|
-        line.strip!
-        got << line if !line.empty?
-      end
-
-      expected = [
-        'Some text', # ⛔️ removed with encoding sanitizing
-        'More text',
-        "#define STR1",
-        "#define STR2"
-      ]
-
-      expect( got ).to eq expected
-    end
-
-  end
-
   context "#extract_includes" do
     it "should extract #include directives from code" do
       # Complex comments tested in `clean_code_line()` test case