-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
FIX downcast nominal features whenever possible in LIAC-ARFF #22354
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -280,21 +280,34 @@ def _unquote(v): | |
return v | ||
|
||
|
||
def _downcast(value): | ||
"""Downcast a value to integral or float type if possible.""" | ||
if value is None: | ||
return value | ||
try: | ||
return int(value) | ||
except ValueError: | ||
try: | ||
return float(value) | ||
except ValueError: | ||
return value | ||
|
||
|
||
def _parse_values(s): | ||
'''(INTERNAL) Split a line into a list of values''' | ||
if not _RE_NONTRIVIAL_DATA.search(s): | ||
# Fast path for trivial cases (unfortunately we have to handle missing | ||
# values because of the empty string case :(.) | ||
return [None if s in ('?', '') else s | ||
return [None if s in ('?', '') else _downcast(s) | ||
for s in next(csv.reader([s]))] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is so confusing to reuse `s`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agreed and do not look around too much :P |
||
|
||
# _RE_DENSE_VALUES tokenizes despite quoting, whitespace, etc. | ||
values, errors = zip(*_RE_DENSE_VALUES.findall(',' + s)) | ||
if not any(errors): | ||
return [_unquote(v) for v in values] | ||
return [_downcast(_unquote(v)) for v in values] | ||
if _RE_SPARSE_LINE.match(s): | ||
try: | ||
return {int(k): _unquote(v) | ||
return {int(k): _downcast(_unquote(v)) | ||
for k, v in _RE_SPARSE_KEY_VALUES.findall(s)} | ||
except ValueError: | ||
# an ARFF syntax error in sparse data | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we decide to pursue this PR anyway, please write a dedicated unit test for this helper function in isolation.