Skip to content

Parser location #306

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/syntax_tree/formatter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def format(node, stackable: true)
# going to just print out the node as it was seen in the source.
doc =
if last_leading&.ignore?
range = source[node.location.start_char...node.location.end_char]
range = source[node.start_char...node.end_char]
first = true

range.each_line(chomp: true) do |line|
Expand Down
8 changes: 8 additions & 0 deletions lib/syntax_tree/node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,14 @@ def format(q)
raise NotImplementedError
end

# Returns the character offset in the source where this node begins.
# Convenience delegator for location.start_char so callers don't have to
# reach through the location object.
def start_char
  location.start_char
end

# Returns the character offset in the source where this node ends.
# Convenience delegator for location.end_char, mirroring #start_char.
def end_char
  location.end_char
end

def pretty_print(q)
accept(Visitor::PrettyPrintVisitor.new(q))
end
Expand Down
211 changes: 149 additions & 62 deletions lib/syntax_tree/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,37 @@ def find_token(type)
tokens[index] if index
end

# Returns the last token of the given class whose starting character lies
# between the end of the +left+ node and the start of the +right+ node, or
# nil when no such token exists.
#
# The token list is scanned from the back; once a token starts before the
# lower bound we bail out early, since every earlier token is also out of
# range.
def find_token_between(type, left, right)
  # Use the node-level start_char/end_char delegators for consistency with
  # find_keyword_between below.
  bounds = left.end_char...right.start_char
  index =
    tokens.rindex do |token|
      char = token.location.start_char
      break if char < bounds.begin

      token.is_a?(type) && bounds.cover?(char)
    end

  tokens[index] if index
end

# Returns the most recently lexed keyword token with the given name, or
# nil when the token list does not contain one.
def find_keyword(name)
  tokens.reverse_each.find do |token|
    token.is_a?(Kw) && token.name == name
  end
end

# Returns the last keyword token with the given name whose starting
# character lies between the end of the +left+ node and the start of the
# +right+ node, or nil when no such keyword exists.
def find_keyword_between(name, left, right)
  range = left.end_char...right.start_char

  tokens.reverse_each do |token|
    char = token.location.start_char

    # Tokens earlier in the list start even further left, so once we have
    # passed the lower bound there is nothing left to find.
    return nil if char < range.begin
    return token if token.is_a?(Kw) && token.name == name && range.cover?(char)
  end

  nil
end

def find_operator(name)
index = tokens.rindex { |token| token.is_a?(Op) && (token.name == name) }
tokens[index] if index
Expand Down Expand Up @@ -645,7 +671,7 @@ def visit_var_ref(node)
end

def self.visit(node, tokens)
start_char = node.location.start_char
start_char = node.start_char
allocated = []

tokens.reverse_each do |token|
Expand Down Expand Up @@ -874,13 +900,34 @@ def on_binary(left, operator, right)
# on_block_var: (Params params, (nil | Array[Ident]) locals) -> BlockVar
def on_block_var(params, locals)
index =
tokens.rindex do |node|
node.is_a?(Op) && %w[| ||].include?(node.value) &&
node.location.start_char < params.location.start_char
end
tokens.rindex { |node| node.is_a?(Op) && %w[| ||].include?(node.value) }

ending = tokens.delete_at(index)
beginning = ending.value == "||" ? ending : consume_operator(:|)

# If there are no parameters, then we didn't have anything to base the
# location information off of. Now that we have an opening of the
# block, we can correct this.
if params.empty?
start_line = params.location.start_line
start_char =
(
if beginning.value == "||"
beginning.location.start_char
else
find_next_statement_start(beginning.location.end_char)
end
)

beginning = tokens[index]
ending = tokens[-1]
location =
Location.fixed(
line: start_line,
char: start_char,
column: start_char - line_counts[start_line - 1].start
)

params = params.copy(location: location)
end

BlockVar.new(
params: params,
Expand Down Expand Up @@ -1760,21 +1807,19 @@ def on_for(index, collection, statements)
in_keyword = consume_keyword(:in)
ending = consume_keyword(:end)

# Consume the do keyword if it exists so that it doesn't get confused for
# some other block
keyword = find_keyword(:do)
if keyword &&
keyword.location.start_char > collection.location.end_char &&
keyword.location.end_char < ending.location.start_char
tokens.delete(keyword)
end
delimiter =
find_keyword_between(:do, collection, ending) ||
find_token_between(Semicolon, collection, ending)

tokens.delete(delimiter) if delimiter

start_char =
find_next_statement_start((keyword || collection).location.end_char)
find_next_statement_start((delimiter || collection).location.end_char)

statements.bind(
start_char,
start_char -
line_counts[(keyword || collection).location.end_line - 1].start,
line_counts[(delimiter || collection).location.end_line - 1].start,
ending.location.start_char,
ending.location.start_column
)
Expand Down Expand Up @@ -1984,7 +2029,12 @@ def on_if(predicate, statements, consequent)
beginning = consume_keyword(:if)
ending = consequent || consume_keyword(:end)

start_char = find_next_statement_start(predicate.location.end_char)
if (keyword = find_keyword_between(:then, predicate, ending))
tokens.delete(keyword)
end

start_char =
find_next_statement_start((keyword || predicate).location.end_char)
statements.bind(
start_char,
start_char - line_counts[predicate.location.end_line - 1].start,
Expand Down Expand Up @@ -2068,7 +2118,8 @@ def on_in(pattern, statements, consequent)
statements_start = token
end

start_char = find_next_statement_start(statements_start.location.end_char)
start_char =
find_next_statement_start((token || statements_start).location.end_char)
statements.bind(
start_char,
start_char -
Expand Down Expand Up @@ -2194,12 +2245,19 @@ def on_lambda(params, statements)
token.location.start_char > beginning.location.start_char
end

if braces
opening = consume_token(TLamBeg)
closing = consume_token(RBrace)
else
opening = consume_keyword(:do)
closing = consume_keyword(:end)
end

# We need to do some special mapping here. Since ripper doesn't support
# capturing lambda var until 3.2, we need to normalize all of that here.
# capturing lambda vars, we need to normalize all of that here.
params =
case params
when Paren
# In this case we've gotten to the <3.2 parentheses wrapping a set of
if params.is_a?(Paren)
# In this case we've gotten to the parentheses wrapping a set of
# parameters case. Here we need to manually scan for lambda locals.
range = (params.location.start_char + 1)...params.location.end_char
locals = lambda_locals(source[range])
Expand All @@ -2221,25 +2279,28 @@ def on_lambda(params, statements)

node.comments.concat(params.comments)
node
when Params
# In this case we've gotten to the <3.2 plain set of parameters. In
# this case there cannot be lambda locals, so we will wrap the
# parameters into a lambda var that has no locals.
else
# If there are no parameters, then we didn't have anything to base the
# location information off of. Now that we have an opening of the
# block, we can correct this.
if params.empty?
opening_location = opening.location
location =
Location.fixed(
line: opening_location.start_line,
char: opening_location.start_char,
column: opening_location.start_column
)

params = params.copy(location: location)
end

# In this case we've gotten to the plain set of parameters. In this
# case there cannot be lambda locals, so we will wrap the parameters
# into a lambda var that has no locals.
LambdaVar.new(params: params, locals: [], location: params.location)
when LambdaVar
# In this case we've gotten to 3.2+ lambda var. In this case we don't
# need to do anything and can just the value as given.
params
end

if braces
opening = consume_token(TLamBeg)
closing = consume_token(RBrace)
else
opening = consume_keyword(:do)
closing = consume_keyword(:end)
end

start_char = find_next_statement_start(opening.location.end_char)
statements.bind(
start_char,
Expand Down Expand Up @@ -3134,7 +3195,7 @@ def on_rescue(exceptions, variable, statements, consequent)
exceptions = exceptions[0] if exceptions.is_a?(Array)

last_node = variable || exceptions || keyword
start_char = find_next_statement_start(last_node.location.end_char)
start_char = find_next_statement_start(last_node.end_char)
statements.bind(
start_char,
start_char - line_counts[last_node.location.start_line - 1].start,
Expand All @@ -3156,7 +3217,7 @@ def on_rescue(exceptions, variable, statements, consequent)
start_char: keyword.location.end_char + 1,
start_column: keyword.location.end_column + 1,
end_line: last_node.location.end_line,
end_char: last_node.location.end_char,
end_char: last_node.end_char,
end_column: last_node.location.end_column
)
)
Expand Down Expand Up @@ -3267,9 +3328,29 @@ def on_sclass(target, bodystmt)
)
end

# def on_semicolon(value)
# value
# end
# Semicolons are tokens that get added to the token list but never get
# attached to the AST. Because of this they only need to track their
# associated location so they can be used for computing bounds.
class Semicolon
  # [Location] the position of this semicolon in the source
  attr_reader :location

  def initialize(location)
    @location = location
  end
end

# :call-seq:
#   on_semicolon: (String value) -> Semicolon
def on_semicolon(value)
  # Build the location from the lexer's current position. The token is
  # only kept on the token list (for bounds computations) and is never
  # attached to the AST.
  location =
    Location.token(
      line: lineno,
      char: char_pos,
      column: current_column,
      size: value.size
    )

  tokens << Semicolon.new(location)
end

# def on_sp(value)
# value
Expand Down Expand Up @@ -3706,7 +3787,12 @@ def on_unless(predicate, statements, consequent)
beginning = consume_keyword(:unless)
ending = consequent || consume_keyword(:end)

start_char = find_next_statement_start(predicate.location.end_char)
if (keyword = find_keyword_between(:then, predicate, ending))
tokens.delete(keyword)
end

start_char =
find_next_statement_start((keyword || predicate).location.end_char)
statements.bind(
start_char,
start_char - line_counts[predicate.location.end_line - 1].start,
Expand Down Expand Up @@ -3742,16 +3828,16 @@ def on_until(predicate, statements)
beginning = consume_keyword(:until)
ending = consume_keyword(:end)

# Consume the do keyword if it exists so that it doesn't get confused for
# some other block
keyword = find_keyword(:do)
if keyword && keyword.location.start_char > predicate.location.end_char &&
keyword.location.end_char < ending.location.start_char
tokens.delete(keyword)
end
delimiter =
find_keyword_between(:do, predicate, statements) ||
find_token_between(Semicolon, predicate, statements)

tokens.delete(delimiter) if delimiter

# Update the Statements location information
start_char = find_next_statement_start(predicate.location.end_char)
start_char =
find_next_statement_start((delimiter || predicate).location.end_char)

statements.bind(
start_char,
start_char - line_counts[predicate.location.end_line - 1].start,
Expand Down Expand Up @@ -3845,7 +3931,8 @@ def on_when(arguments, statements, consequent)
statements_start = token
end

start_char = find_next_statement_start(statements_start.location.end_char)
start_char =
find_next_statement_start((token || statements_start).location.end_char)

statements.bind(
start_char,
Expand All @@ -3869,16 +3956,16 @@ def on_while(predicate, statements)
beginning = consume_keyword(:while)
ending = consume_keyword(:end)

# Consume the do keyword if it exists so that it doesn't get confused for
# some other block
keyword = find_keyword(:do)
if keyword && keyword.location.start_char > predicate.location.end_char &&
keyword.location.end_char < ending.location.start_char
tokens.delete(keyword)
end
delimiter =
find_keyword_between(:do, predicate, statements) ||
find_token_between(Semicolon, predicate, statements)

tokens.delete(delimiter) if delimiter

# Update the Statements location information
start_char = find_next_statement_start(predicate.location.end_char)
start_char =
find_next_statement_start((delimiter || predicate).location.end_char)

statements.bind(
start_char,
start_char - line_counts[predicate.location.end_line - 1].start,
Expand Down
11 changes: 11 additions & 0 deletions lib/syntax_tree/translation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,16 @@ def self.to_parser(node, buffer)

node.accept(Parser.new(buffer))
end

# This method translates the given node into the representation defined by
# the rubocop/rubocop-ast gem. We don't explicitly list it as a dependency
# because it's not required for the core functionality of Syntax Tree, so
# the requires are deferred until the first call.
def self.to_rubocop_ast(node, buffer)
  require "rubocop/ast"
  require_relative "translation/parser"
  require_relative "translation/rubocop_ast"

  visitor = RuboCopAST.new(buffer)
  node.accept(visitor)
end
end
end
Loading