|
22 | 22 | import re
|
23 | 23 | import sys
|
24 | 24 |
|
25 |
| -# Check for unidecode dependency early, with a clear message if missing |
| 25 | +# Check for ftfy dependency early, with a clear message if missing |
26 | 26 | try:
|
27 |
| - from unidecode import unidecode # noqa: F401 |
| 27 | + import ftfy |
28 | 28 | except ImportError:
|
29 | 29 | print(
|
30 |
| - "[✗] Missing dependency: 'Unidecode'. Please install it with:\n" |
31 |
| - " pip install Unidecode\n" |
| 30 | + "[✗] Missing dependency: 'ftfy'. Please install it with:\n" |
| 31 | + " pip install ftfy\n" |
32 | 32 | "Or install all requirements with:\n"
|
33 | 33 | " pip install -r requirements.txt",
|
34 | 34 | file=sys.stderr
|
@@ -63,6 +63,10 @@ def clean_text(text: str, preserve_invisible: bool = False) -> str:
|
63 | 63 | Returns:
|
64 | 64 | str: The cleaned text with normalized ASCII characters
|
65 | 65 | """
|
| 66 | + # Use ftfy for intelligent text fixing and normalization |
| 67 | + text = ftfy.fix_text(text) |
| 68 | + |
| 69 | + # Handle specific cases that ftfy might not handle perfectly |
66 | 70 | replacements = {
|
67 | 71 | '\u2018': "'", '\u2019': "'", # Smart single quotes
|
68 | 72 | '\u201C': '"', '\u201D': '"', # Smart double quotes
|
@@ -153,11 +157,14 @@ def main():
|
153 | 157 | # No files provided: filter mode (STDIN to STDOUT)
|
154 | 158 | raw = sys.stdin.read()
|
155 | 159 | cleaned = clean_text(raw, preserve_invisible=args.invisible)
|
156 |
| - # Add or suppress newline at EOF based on -n/--no-newline |
| 160 | + |
| 161 | + # Handle newline at EOF based on -n/--no-newline |
157 | 162 | if not args.no_newline:
|
158 |
| - cleaned = ensure_single_newline(cleaned) |
159 |
| - else: |
160 |
| - cleaned = cleaned.rstrip('\r\n') |
| 163 | + # Only add newline if there isn't one already |
| 164 | + if not cleaned.endswith('\n'): |
| 165 | + cleaned += '\n' |
| 166 | + # If --no-newline is specified, leave the output exactly as is (no changes to newlines) |
| 167 | + |
161 | 168 | sys.stdout.write(cleaned)
|
162 | 169 | return
|
163 | 170 |
|
|
0 commit comments