Skip to content

Commit 7cb6dca

Browse files
rth authored and Manishearth committed
MAINT Fixes for Python scripts (#54)
* Fixes to Python scripts
* Update src/testdata.rs
1 parent c7a6b6f commit 7cb6dca

File tree

3 files changed

+23
-23
lines changed

3 files changed

+23
-23
lines changed

scripts/unicode.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
# Since this should not require frequent updates, we just store this
2121
# out-of-line and check the unicode.rs file into git.
2222

23-
import fileinput, re, os, sys, operator
23+
import fileinput, re, os, sys
2424

2525
preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
2626
// file at the top-level directory of this distribution and at
@@ -59,7 +59,7 @@ def is_surrogate(n):
5959

6060
def fetch(f):
6161
if not os.path.exists(os.path.basename(f)):
62-
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
62+
os.system("curl -O http://www.unicode.org/Public/9.0.0/ucd/%s"
6363
% f)
6464

6565
if not os.path.exists(os.path.basename(f)):
@@ -80,7 +80,7 @@ def load_gencats(f):
8080
if is_surrogate(cp):
8181
continue
8282
if range_start >= 0:
83-
for i in xrange(range_start, cp):
83+
for i in range(range_start, cp):
8484
udict[i] = data;
8585
range_start = -1;
8686
if data[1].endswith(", First>"):
@@ -150,8 +150,8 @@ def format_table_content(f, content, indent):
150150
def load_properties(f, interestingprops):
151151
fetch(f)
152152
props = {}
153-
re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)")
154-
re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
153+
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
154+
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
155155

156156
for line in fileinput.input(os.path.basename(f)):
157157
prop = None
@@ -309,7 +309,7 @@ def emit_break_module(f, break_table, break_cats, name):
309309
# download and parse all the data
310310
fetch("ReadMe.txt")
311311
with open("ReadMe.txt") as readme:
312-
pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
312+
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
313313
unicode_version = re.search(pattern, readme.read()).groups()
314314
rf.write("""
315315
/// The version of [Unicode](http://www.unicode.org/)
@@ -342,19 +342,19 @@ def emit_break_module(f, break_table, break_cats, name):
342342
for cat in grapheme_cats:
343343
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
344344
grapheme_table.sort(key=lambda w: w[0])
345-
emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme")
345+
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
346346
rf.write("\n")
347347

348348
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
349349
word_table = []
350350
for cat in word_cats:
351351
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
352352
word_table.sort(key=lambda w: w[0])
353-
emit_break_module(rf, word_table, word_cats.keys(), "word")
353+
emit_break_module(rf, word_table, list(word_cats.keys()), "word")
354354

355355
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
356356
sentence_table = []
357357
for cat in sentence_cats:
358358
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
359359
sentence_table.sort(key=lambda w: w[0])
360-
emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
360+
emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence")

scripts/unicode_gen_breaktests.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -17,23 +17,23 @@
1717
#
1818
# Since this should not require frequent updates, we just store this
1919
# out-of-line and check the unicode.rs file into git.
20+
from __future__ import print_function
2021

2122
import unicode, re, os, fileinput
2223

2324
def load_test_data(f, optsplit=[]):
24-
outls = []
25-
testRe1 = re.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
25+
testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
2626

2727
unicode.fetch(f)
2828
data = []
2929
for line in fileinput.input(os.path.basename(f)):
3030
# lines that include a test start with the ÷ character
31-
if len(line) < 2 or line[0:2] != '÷':
31+
if len(line) < 2 or not line.startswith('÷'):
3232
continue
3333

3434
m = testRe1.match(line)
3535
if not m:
36-
print "error: no match on line where test was expected: %s" % line
36+
print("error: no match on line where test was expected: %s" % line)
3737
continue
3838

3939
# process the characters in this test case
@@ -48,9 +48,9 @@ def load_test_data(f, optsplit=[]):
4848
# make sure that we have break info for each break!
4949
assert len(chars) - 1 == len(info)
5050

51-
outls.append((chars, info))
51+
data.append((chars, info))
5252

53-
return outls
53+
return data
5454

5555
def process_split_info(s, c, o):
5656
outcs = []
@@ -59,7 +59,7 @@ def process_split_info(s, c, o):
5959

6060
# are we on a × or a ÷?
6161
isX = False
62-
if s[0:2] == '×':
62+
if s.startswith('×'):
6363
isX = True
6464

6565
# find each instance of '(÷|×) [x.y] '
@@ -81,10 +81,10 @@ def process_split_info(s, c, o):
8181

8282
idx = 1
8383
while idx < len(s):
84-
if s[idx:idx+2] == '×':
84+
if s[idx:].startswith('×'):
8585
isX = True
8686
break
87-
if s[idx:idx+2] == '÷':
87+
if s[idx:].startswith('÷'):
8888
isX = False
8989
break
9090
idx += 1
@@ -172,7 +172,7 @@ def create_grapheme_data(f):
172172
stype = "&'static [(&'static str, &'static [&'static str])]"
173173
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
174174
f.write(" // official Unicode test data\n")
175-
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
175+
f.write(" // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
176176
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
177177
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
178178

@@ -187,7 +187,7 @@ def create_words_data(f):
187187

188188
wtype = "&'static [(&'static str, &'static [&'static str])]"
189189
f.write(" // official Unicode test data\n")
190-
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
190+
f.write(" // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt\n")
191191
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
192192

193193
def create_sentence_data(f):

src/testdata.rs

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
1+
// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
22
// file at the top-level directory of this distribution and at
33
// http://rust-lang.org/COPYRIGHT.
44
//
@@ -12,7 +12,7 @@
1212

1313
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
1414
// official Unicode test data
15-
// http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt
15+
// http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt
1616
pub const TEST_SAME: &'static [(&'static str, &'static [&'static str])] = &[
1717
("\u{20}\u{20}", &["\u{20}", "\u{20}"]), ("\u{20}\u{308}\u{20}", &["\u{20}\u{308}",
1818
"\u{20}"]), ("\u{20}\u{d}", &["\u{20}", "\u{d}"]), ("\u{20}\u{308}\u{d}", &["\u{20}\u{308}",
@@ -516,7 +516,7 @@
516516
];
517517

518518
// official Unicode test data
519-
// http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
519+
// http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt
520520
pub const TEST_WORD: &'static [(&'static str, &'static [&'static str])] = &[
521521
("\u{1}\u{1}", &["\u{1}", "\u{1}"]), ("\u{1}\u{308}\u{1}", &["\u{1}\u{308}", "\u{1}"]),
522522
("\u{1}\u{d}", &["\u{1}", "\u{d}"]), ("\u{1}\u{308}\u{d}", &["\u{1}\u{308}", "\u{d}"]),

0 commit comments

Comments
 (0)