Support non-ASCII spaces with the new parser (robotframework#3121)

pekkaklarck · yanne · commit 41e9cd74aa09 · 2019-09-03T07:29:37.000+03:00
Non-ASCII spaces can now be used
- as separators everywhere,
- in arguments,
- in setting/header names,
- etc.

FOR loop support isn't fully ready yet; so far only the loop separators (IN, IN RANGE, IN ENUMERATE, IN ZIP) accept non-ASCII spaces.
diff --git a/atest/robot/parsing/non_ascii_spaces.robot b/atest/robot/parsing/non_ascii_spaces.robot
@@ -5,58 +5,19 @@ Resource         atest_resource.robot
 *** Test Cases ***
 In suite settings
     ${tc} =    Check Test Case    In test and keywords
-    Check Log Message    ${tc.setup.kws[0].msgs[0]}       ': :'
-    Check Log Message    ${tc.teardown.kws[0].msgs[0]}    ': :'
-    Normalization deprecated    0    Test\\xa0Setup                  2
-    Normalization deprecated    1    No-break\\xa0space              2
-    Normalization deprecated    2    :\\xa0:                         2
-    Normalization deprecated    3    Test\\u1680Teardown             3
-    Normalization deprecated    4    Ogham\\u1680space\\u1680mark    3
-    Normalization deprecated    5    :\\u1680:                       3
-
-In variables
-    Normalization deprecated    6    \${NO-BREAK\\xa0SPACE}\\xa0=    6
-    Normalization deprecated    7    :\\xa0:                         6
-    Normalization deprecated    8    \${OGHAM\\u1680SPACE\\u1680MARK}\\u1680=    7
-    Normalization deprecated    9    :\\u1680:                                   7
-    Normalization deprecated    10   \${IDEOGRAPHIC\\u3000SPACE}\\u3000=         8
-    Normalization deprecated    11   :\\u3000:                                   8
+    Check Log Message    ${tc.setup.kws[0].msgs[0]}       ':\\xa0:'
+    Check Log Message    ${tc.setup.kws[1].msgs[0]}       : :
+    Check Log Message    ${tc.teardown.kws[0].msgs[0]}    ':\\u1680:'
+    Check Log Message    ${tc.teardown.kws[1].msgs[0]}    : :
 
 In test and keywords
     ${tc} =    Check Test Case    ${TESTNAME}
-    Normalization deprecated    12    [\\xa0Tags\\u1680]                  14
-    Normalization deprecated    13    NBSP\\xa0and\\u1680Ogham            14
-    Normalization deprecated    14    \${x}\\xa0=                         15
-    Normalization deprecated    15    No-break\\xa0space                  15
-    Normalization deprecated    16    :\\xa0:                             15
-    Normalization deprecated    17    \${x}\\u1680=                       16
-    Normalization deprecated    18    Ogham\\u1680space\\u1680mark        16
-    Normalization deprecated    19    :\\u1680:                           16
-    Normalization deprecated    20    \${x}\\u3000=                       17
-    Normalization deprecated    21    Ideographic\\u3000space             17
-    Normalization deprecated    22    :\\u3000:                           17
-    Normalization deprecated    23    No-break\\xa0space                  21
-    Normalization deprecated    24    :\\xa0:                             21
-    Normalization deprecated    25    No-break\\xa0space                  25
-    Normalization deprecated    26    :\\xa0:                             25
-    Normalization deprecated    27    No-break\\xa0space                  28
-    Normalization deprecated    28    [\\xa0Arguments\\xa0]               29
-    Normalization deprecated    29    Should\\xa0be\\xa0equal             31
-    Normalization deprecated    30    Should\\xa0be\\xa0equal             32
-    Normalization deprecated    31    Should\\xa0be\\xa0equal             33
-    Normalization deprecated    32    \${NO-BREAK\\xa0SPACE}              33
-    Normalization deprecated    33    Ogham\\u1680space\\u1680mark        35
-    Normalization deprecated    34    [\\u1680Arguments\\u1680]           36
-    Normalization deprecated    35    Should\\u1680be\\u1680equal         38
-    Normalization deprecated    36    Should\\u1680be\\u1680equal         39
-    Normalization deprecated    37    Should\\u1680be\\u1680equal         40
-    Normalization deprecated    38    \${OGHAM\\u1680SPACE\\u1680MARK}    40
-    Normalization deprecated    39    Ideographic\\u3000space             42
-    Normalization deprecated    40    [\\u3000Arguments\\u3000]           43
-    Normalization deprecated    41    Should\\u3000be\\u3000equal         45
-    Normalization deprecated    42    Should\\u3000be\\u3000equal         46
-    Normalization deprecated    43    Should\\u3000be\\u3000equal         47
-    Normalization deprecated    44    \${IDEOGRAPHIC\\u3000SPACE}         47
+    Check Log Message    ${tc.kws[0].kws[0].msgs[0]}      ':\\xa0:'
+    Check Log Message    ${tc.kws[0].kws[1].msgs[0]}      : :
+    Check Log Message    ${tc.kws[1].kws[0].msgs[0]}      ':\\u1680:'
+    Check Log Message    ${tc.kws[1].kws[1].msgs[0]}      : :
+    Check Log Message    ${tc.kws[2].kws[0].msgs[0]}      ':\\u3000:'
+    Check Log Message    ${tc.kws[2].kws[1].msgs[0]}      :　:
 
 As separator
     Check Test Case    ${TESTNAME}
@@ -66,13 +27,12 @@ With pipes
 
 In header
     Check Test Case    ${TESTNAME}
-    Normalization deprecated    45    ***\\xa0Test\\u1680Cases\\u3000***    49
 
-*** Keywords ***
-Normalization deprecated
-    [Arguments]    ${index}    ${text}    ${line}
-    ${path} =    Normalize Path    ${DATADIR}/parsing/non_ascii_spaces.robot
-    ${msg} =    Catenate
-    ...    Converting whitespace characters to ASCII spaces during parsing is deprecated.
-    ...    Fix '${text}' in file '${path}' on line ${line}.
-    Check Log Message    ${ERRORS}[${index}]    ${msg}    WARN
+In test casename
+    Check Test Case    ${TESTNAME}
+
+In WITH NAME
+    Check Test Case    ${TESTNAME}
+
+In FOR separator
+    Check Test Case    ${TESTNAME}
diff --git a/atest/robot/parsing/non_breaking_space.robot b/atest/robot/parsing/non_breaking_space.robot
diff --git a/atest/testdata/parsing/nbsp.robot b/atest/testdata/parsing/nbsp.robot
diff --git a/atest/testdata/parsing/nbsp.tsv b/atest/testdata/parsing/nbsp.tsv
diff --git a/atest/testdata/parsing/non_ascii_spaces.robot b/atest/testdata/parsing/non_ascii_spaces.robot
@@ -1,51 +1,73 @@
 *** Settings ***
 Test Setup        No-break space       : :
 Test Teardown     Ogham space mark     : :
+Library  　  　  　OperatingSystem  　  　WITH NAME  　OS
 
 *** Variables ***
 ${NO-BREAK SPACE} =         : :
-${OGHAM SPACE MARK} =       : :
-${IDEOGRAPHIC　SPACE}　=    :　:
+${OGHAM SPACE MARK} =       : :  　  　# Trailing  　  　  　
+${IDEOGRAPHIC　SPACE}　=    :　:  　  　  　  　  　  　
 
 *** Test Cases ***
 In test and keywords
     [Documentation]    Used in keyword name, arguments and assign.
-    ...                In RF 3.2 arguments shouldn't be normalized.
     [ Tags ]    NBSP and Ogham
     ${x} =    No-break space        : :
-    ${x} =    Ogham space mark       : :
-    ${x}　=    Ideographic　space    :　:
+    ${x} =    Ogham space mark       : :  　  　# Trailing  　  　  　
+    ${x}　=    Ideographic　space    :　:  　  　  　  　  　  　
 
 As separator
-    [Documentation]    In RF 3.1 only NBSP works
     No-break space    : :
+     Ogham space mark    : :
+　　　　Ideographic　space　　　　:　:
 
 With pipes
-    [Documentation]    In RF 3.1 only NBSP works
 | | No-break space  | : : |
+| | Ogham space mark | : : |
+|　|　Ideographic　space　|　:　:　|
 
 *** Keywords ***
 No-break space
     [ Arguments ]    ${arg}
-    Log    ${arg}    repr=True
-    Should be equal    ${arg}    : :
+    Log    ${arg}    formatter=repr
+    Log    ${arg}
+    Should be equal    ${arg}    : :
     Should be equal    ${arg}    ${NO-BREAK SPACE}
     Should be equal    ${arg}    ${NO-BREAK SPACE}
 
 Ogham space mark
     [ Arguments ]    ${arg}
-    Log    ${arg}    repr=True
-    Should be equal    ${arg}    : :
+    Log    ${arg}    formatter=repr
+    Log    ${arg}
+    Should be equal    ${arg}    : :
     Should be equal    ${arg}    ${OGHAM SPACE MARK}
     Should be equal    ${arg}    ${OGHAM SPACE MARK}
 
 Ideographic　space
     [　Arguments　]    ${arg}
-    Log    ${arg}    repr=True
-    Should　be　equal    ${arg}    : :
+    Log    ${arg}    formatter=repr
+    Log    ${arg}
+    Should　be　equal    ${arg}    :　:
     Should　be　equal    ${arg}    ${IDEOGRAPHIC SPACE}
     Should　be　equal    ${arg}    ${IDEOGRAPHIC　SPACE}
 
 *** Test Cases　***
 In header
     No operation
+
+In test case　name
+    No operation
+
+In WITH NAME
+    OS.Directory Should Exist    ${CURDIR}
+
+In FOR separator
+    FOR    ${index}    IN RANGE    1
+        Should Be Equal    ${index}    ${0}
+    END
+    FOR    ${index}    ${item}    IN ENUMERATE    value
+        Should Be Equal    ${index}: ${item}    0: value
+    END
+    FOR    ${tag}    IN　ZIP    ${TEST TAGS}
+        Fail    Should not be executed
+    END
diff --git a/src/robot/parsing/lexer/lexers.py b/src/robot/parsing/lexer/lexers.py
@@ -14,6 +14,7 @@
 #  limitations under the License.
 
 from robot.variables import is_var
+from robot.utils import normalize_whitespace
 
 from .tokens import Token
 
@@ -107,10 +108,13 @@ class SectionLexer(BlockLexer):
 
     @classmethod
     def handles(cls, statement):
-        # TODO: Non-ASCII spaces
         marker = statement[0].value
         return (marker.startswith('*') and
-                marker.strip('* ').title() in cls.markers)
+                cls._normalize(marker) in cls.markers)
+
+    @classmethod
+    def _normalize(cls, marker):
+        return normalize_whitespace(marker).strip('* ').title()
 
     def accepts_more(self, statement):
         return not statement[0].value.startswith('*')
@@ -218,7 +222,6 @@ def lex(self, ctx):
 
 
 class TestCaseSectionLexer(SectionLexer):
-    # FIXME: Non-ASCII spaces
     markers = ('Test Case', 'Test Cases', 'Task', 'Tasks')
 
     def lexer_classes(self):
@@ -302,6 +305,7 @@ def handles(cls, statement):
 
 
 class ForLoopLexer(StatementLexer):
+    _separators = ('IN', 'IN RANGE', 'IN ENUMERATE', 'IN ZIP')
 
     @classmethod
     def handles(cls, statement):
@@ -310,7 +314,7 @@ def handles(cls, statement):
                 marker.startswith(':') and
                 marker.replace(':', '').replace(' ', '').upper() == 'FOR')
 
-    def lex(self, ctc):
+    def lex(self, ctx):
         separator_seen = False
         arguments_seen = False
         self.statement[0].type = Token.FOR
@@ -325,8 +329,7 @@ def lex(self, ctc):
     def _is_separator(self, value, arguments_seen, separator_seen):
         if separator_seen or not arguments_seen:
             return False
-        # FIXME: Non-ASCII spaces
-        return value in ('IN', 'IN RANGE', 'IN ENUMERATE', 'IN ZIP')
+        return normalize_whitespace(value) in self._separators
 
 
 class EndLexer(StatementLexer):
diff --git a/src/robot/parsing/lexer/settings.py b/src/robot/parsing/lexer/settings.py
@@ -13,6 +13,8 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+from robot.utils import normalize_whitespace
+
 from .tokens import Token
 
 
@@ -51,7 +53,7 @@ def _validate(self, name, normalized, statement):
                              % (name, len(statement) - 1))
 
     def _normalize_name(self, name):
-        upper = name.upper()  # TODO: Non-ASCII spaces
+        upper = normalize_whitespace(name).upper()
         if upper in self.aliases:
             return self.aliases[upper]
         return upper
@@ -61,7 +63,6 @@ def _format_name(self, name):
 
 
 class TestCaseFileSettings(Settings):
-    # FIXME: Non-ASCII spaces
     names = (
         'DOCUMENTATION',
         'SUITE SETUP',
diff --git a/src/robot/parsing/nodes.py b/src/robot/parsing/nodes.py
@@ -16,6 +16,8 @@
 from ast import AST
 import re
 
+from robot.utils import normalize_whitespace
+
 
 class Node(AST):
     _fields = ()
@@ -27,8 +29,8 @@ def _add_joiners(self, values):
                 yield self._joiner_based_on_eol_escapes(item)
 
     def _joiner_based_on_eol_escapes(self, item):
-        _end_of_line_escapes = re.compile(r'(\\+)n?$')
-        match = _end_of_line_escapes.search(item)
+        eol_escapes = re.compile(r'(\\+)n?$')
+        match = eol_escapes.search(item)
         if match and len(match.group(1)) % 2 == 1:
             return ''
         return '\n'
@@ -74,7 +76,7 @@ class TestCaseSection(Node):
 
     def __init__(self, tests, header):
         self.tests = tests
-        self.header = header[0].strip("*").strip()
+        self.header = header[0].strip('*').strip()
 
 
 class KeywordSection(Node):
@@ -151,7 +153,7 @@ def __init__(self, name, args):
         self.alias = alias
 
     def _split_alias(self, args):
-        if len(args) > 1 and args[-2] == 'WITH NAME':
+        if len(args) > 1 and normalize_whitespace(args[-2]) == 'WITH NAME':
             return args[:-2], args[-1]
         return args, None
 
diff --git a/src/robot/utils/__init__.py b/src/robot/utils/__init__.py
@@ -51,7 +51,7 @@
 from .match import eq, Matcher, MultiMatcher
 from .misc import (plural_or_not, printable_name, roundup, seq2str,
                    seq2str2)
-from .normalizing import lower, normalize, NormalizedDict
+from .normalizing import lower, normalize, normalize_whitespace, NormalizedDict
 from .platform import (IRONPYTHON, JAVA_VERSION, JYTHON, PY_VERSION,
                        PY2, PY3, PYPY, UNIXY, WINDOWS, RERAISED_EXCEPTIONS)
 from .recommendations import RecommendationFinder
diff --git a/src/robot/utils/normalizing.py b/src/robot/utils/normalizing.py
@@ -17,6 +17,7 @@
     from collections.abc import MutableMapping
 except ImportError:
     from collections import MutableMapping
+import re
 
 from .platform import IRONPYTHON, PY_VERSION, PY3
 from .robottypes import is_dict_like, is_unicode
@@ -45,6 +46,10 @@ def normalize(string, ignore=(), caseless=True, spaceless=True):
     return string
 
 
+def normalize_whitespace(string):
+    return re.sub(r'\s', ' ', string, flags=re.UNICODE)
+
+
 # http://ironpython.codeplex.com/workitem/33133
 if IRONPYTHON and PY_VERSION < (2, 7, 5):
     def lower(string):