diff --git a/.github/workflows/rake.yml b/.github/workflows/rake.yml index 45725566..6a789f3e 100644 --- a/.github/workflows/rake.yml +++ b/.github/workflows/rake.yml @@ -14,18 +14,8 @@ jobs: strategy: fail-fast: false matrix: - ruby: [ "3.0", 2.7, 2.6 ] + ruby: [ 3.3, 3.2, 3.1, "3.0", 2.7, 2.6 ] os: [ ubuntu-latest, windows-latest, macos-latest ] - include: - - ruby: "3.1" - os: 'ubuntu-latest' - - ruby: "3.2" - os: 'ubuntu-latest' - - ruby: "3.1" - os: 'macos-latest' - - ruby: "3.2" - os: 'macos-latest' - env: BUNDLE_WITHOUT: "secryst" SKIP_JS: "1" @@ -48,5 +38,6 @@ jobs: - name: Run RSpecs working-directory: ./ruby run: | + pip install regex bundle install --with=jsexec - bundle exec rspec -f f + bundle exec rspec diff --git a/Gemfile b/Gemfile index 17c184a8..cf4f70d5 100644 --- a/Gemfile +++ b/Gemfile @@ -26,6 +26,12 @@ unless ENV["SKIP_JS"] end end +unless ENV["SKIP_PYTHON"] + group :pyexec do + gem 'pycall' + end +end + group :rababa do gem 'rababa', "~> 0.1.1" end diff --git a/Rakefile b/Rakefile index 8ff0ee32..e5ed34b3 100644 --- a/Rakefile +++ b/Rakefile @@ -14,6 +14,9 @@ task :compile, [:compiler, :target] do |t, args| when "javascript" require "interscript/compiler/javascript" [Interscript::Compiler::Javascript, "js"] + when "python" + require "interscript/compiler/python" + [Interscript::Compiler::Python, "py"] end FileUtils.mkdir_p(args[:target]) @@ -34,7 +37,7 @@ task :compile, [:compiler, :target] do |t, args| File.write(args[:target] + "/" + map + "." + ext, code) end - File.write(args[:target] + "/index.json", maplist.to_json) + File.write(args[:target] + "/index.json", maplist.to_json) if args[:compiler] == "javascript" end task :generate_visualization_html do diff --git a/docs/demo/20191118-interscript-demo-cast.gif b/docs/demo/20191118-interscript-demo-cast.gif index fe881429..a2808c47 100644 Binary files a/docs/demo/20191118-interscript-demo-cast.gif and b/docs/demo/20191118-interscript-demo-cast.gif differ diff --git a/lib/interscript.rb b/lib/interscript.rb index 7ac1ec49..21c9c2fc 100644 --- a/lib/interscript.rb +++ b/lib/interscript.rb @@ -48,9 +48,9 @@ def transliterate_each(system_code, string, maps={}, &block) load(system_code, maps).(string, each: true, &block) end - def transliterate_file(system_code, input_file, output_file, maps={}) + def transliterate_file(system_code, input_file, output_file, maps={}, compiler: Interscript::Interpreter) input = File.read(input_file) - output = transliterate(system_code, input, maps) + output = transliterate(system_code, input, maps, compiler: compiler) File.open(output_file, 'w') do |f| f.puts(output) @@ -125,7 +125,8 @@ def rababa_provision(model_name, model_uri) ([ENV["RABABA_DATA"]] + possible_paths).compact.each do |path| FileUtils.mkdir_p(path) - write_path = path unless write_path + write_path = path + break rescue end @@ -137,8 +138,8 @@ def rababa_provision(model_name, model_uri) if File.exist?(model_path) && File.mtime(model_path) + 3600 >= Time.now return model_path else - data = URI.open(model_uri).read - File.write(model_path, data) + data = URI.open(model_uri, encoding: "BINARY").read + File.binwrite(model_path, data) return model_path end end diff --git a/lib/interscript/command.rb b/lib/interscript/command.rb index 670b8e65..aa017e27 100644 --- a/lib/interscript/command.rb +++ b/lib/interscript/command.rb @@ -8,14 +8,23 @@ class Command < Thor desc '', 'Transliterate text' option :system, aliases: '-s', required: true, desc: 'Transliteration system' option :output, aliases: '-o', required: false, desc: 'Output file' + option :compiler, aliases: '-c', required: false, desc: 'Compiler (eg. Interscript::Compiler::Python)' # Was this option really well thought out? The last parameter is a cache, isn't it? #option :map, aliases: '-m', required: false, default: "{}", desc: 'Transliteration mapping json' def translit(input) + compiler = if options[:compiler] + compiler = options[:compiler].split("::").last.downcase + require "interscript/compiler/#{compiler}" + Object.const_get(options[:compiler]) + else + Interscript::Interpreter + end + if options[:output] - Interscript.transliterate_file(options[:system], input, options[:output]) #, JSON.parse(options[:map])) + Interscript.transliterate_file(options[:system], input, options[:output], compiler: compiler) else - puts Interscript.transliterate(options[:system], IO.read(input)) + puts Interscript.transliterate(options[:system], IO.read(input), compiler: compiler) end end diff --git a/lib/interscript/compiler/python.rb b/lib/interscript/compiler/python.rb new file mode 100644 index 00000000..13722091 --- /dev/null +++ b/lib/interscript/compiler/python.rb @@ -0,0 +1,331 @@ +require 'pycall' + +class Interscript::Compiler::Python < Interscript::Compiler + def escape(val) + case val + when String, Integer + val.inspect + when Symbol + val.to_s.inspect + when Hash + "{"+ + val.map { |k,v| "#{escape k}:#{escape v}" }.join(",")+ + "}" + when Array + "[" + val.map { |i| escape i }.join(",") + "]" + when nil + "None" + else + pp [:error, val] + exit! + end + end + + def re_escape(val) + @pycall_regex ||= PyCall.import_module("regex") + @pycall_regex.escape(val).gsub("\\", "\\\\\\\\").gsub('"', "\\\\\"") + end + + def new_regexp(str) + "re.compile(\"#{str}\", re.MULTILINE)" + end + + def indent + @indent += 4 + yield + @indent -= 4 + end + + def emit(code) + @code << (" " * @indent) << code << "\n" + code + end + + def compile(map, debug: false) + @indent = 0 + @map = map + @debug = debug + @parallel_trees = {} + @parallel_regexps = {} + @code = "" + emit "import interscript" + emit "import regex as re" + map.dependencies.map(&:full_name).each do |dep| + emit "interscript.load_map(#{escape dep})" + end + + emit "interscript.stdlib.define_map(#{escape map.name})" + + map.aliases.each do |name, value| + val = compile_item(value.data, map, :str) + emit "interscript.stdlib.add_map_alias(#{escape map.name}, #{escape name}, #{val})" + val = "\"" + compile_item(value.data, map, :re) + "\"" + emit "interscript.stdlib.add_map_alias_re(#{escape map.name}, #{escape name}, #{val})" + end + + map.stages.each do |_, stage| + compile_rule(stage, @map, true) + end + @parallel_trees.each do |k,v| + emit "_PTREE_#{k} = #{escape v}" + end + @parallel_regexps.each do |k,v| + v = %{["#{v[0]}", #{escape v[1]}]} + emit "_PRE_#{k} = #{v}" + end + end + + def parallel_regexp_compile(subs_hash) + # puts subs_hash.inspect + regexp = subs_hash.each_with_index.map do |p,i| + "(?P<_%d>%s)" % [i,p[0]] + end.join("|") + subs_regexp = regexp + # puts subs_regexp.inspect + end + + def compile_rule(r, map = @map, wrapper = false) + return if r.reverse_run == true + case r + when Interscript::Node::Stage + if @debug + emit "if not hasattr(interscript, 'map_debug'):" + indent { emit "interscript.map_debug = []" } + end + emit "def _stage_#{r.name}(s):" + indent do + r.children.each do |t| + comp = compile_rule(t, map) + emit %{interscript.map_debug.append([s, #{escape @map.name.to_s}, #{escape r.name.to_s}, #{escape t.inspect}, #{escape comp}])} if @debug + end + emit "return s\n" + end + emit "interscript.stdlib.add_map_stage(#{escape @map.name}, #{escape r.name}, _stage_#{r.name})" + when Interscript::Node::Group::Parallel + begin + # Try to build a tree + a = [] + r.children.each do |i| + raise Interscript::SystemConversionError, "Can't parallelize #{i.class}" unless Interscript::Node::Rule::Sub === i + raise Interscript::SystemConversionError, "Can't parallelize rules with :before" if i.before + raise Interscript::SystemConversionError, "Can't parallelize rules with :after" if i.after + raise Interscript::SystemConversionError, "Can't parallelize rules with :not_before" if i.not_before + raise Interscript::SystemConversionError, "Can't parallelize rules with :not_after" if i.not_after + + next if i.reverse_run == true + a << [compile_item(i.from, map, :par), compile_item(i.to, map, :parstr)] + end + ah = a.hash.abs + unless @parallel_trees.include? ah + tree = Interscript::Stdlib.parallel_replace_compile_tree(a) + @parallel_trees[ah] = tree + end + emit "s = interscript.stdlib.parallel_replace_tree(s, _PTREE_#{ah})" + rescue + # Otherwise let's build a megaregexp + a = [] + Interscript::Stdlib.deterministic_sort_by_max_length(r.children).each do |i| + raise Interscript::SystemConversionError, "Can't parallelize #{i.class}" unless Interscript::Node::Rule::Sub === i + + next if i.reverse_run == true + a << [build_regexp(i, map), compile_item(i.to, map, :parstr)] + end + ah = a.hash.abs + unless @parallel_regexps.include? ah + re = parallel_regexp_compile(a) + @parallel_regexps[ah] = [re, a.map(&:last)] + end + emit "s = interscript.stdlib.parallel_regexp_gsub(s, *_PRE_#{ah})" + end + when Interscript::Node::Rule::Sub + from = new_regexp build_regexp(r, map) + if r.to == :upcase + to = 'interscript.stdlib.upper' + elsif r.to == :downcase + to = 'interscript.stdlib.lower' + else + to = compile_item(r.to, map, :str) + end + emit "s = #{from}.sub(#{to}, s)" + when Interscript::Node::Rule::Funcall + emit "s = interscript.functions.#{r.name}(s, #{escape r.kwargs})" + when Interscript::Node::Rule::Run + if r.stage.map + doc = map.dep_aliases[r.stage.map].document + stage = doc.imported_stages[r.stage.name] + else + stage = map.imported_stages[r.stage.name] + end + emit "s = interscript.transliterate(#{escape stage.doc_name}, s, #{escape stage.name})" + else + raise Interscript::SystemConversionError, "Can't compile unhandled #{r.class}" + end + end + + def build_regexp(r, map=@map) + from = compile_item(r.from, map, :re) + before = compile_item(r.before, map, :re) if r.before + after = compile_item(r.after, map, :re) if r.after + not_before = compile_item(r.not_before, map, :re) if r.not_before + not_after = compile_item(r.not_after, map, :re) if r.not_after + + re = "" + re += "(?<=#{before})" if before + re += "(? "?" , + Interscript::Node::Item::Some => "+" , + Interscript::Node::Item::MaybeSome => "*" }[i.class] + + if target == :par + raise Interscript::SystemConversionError, "Can't use a Maybe in a #{target} context" + end + if Interscript::Node::Item::String === i.data && i.data.data.length != 1 + "(?:" + compile_item(i.data, doc, target) + ")" + resuffix + else + compile_item(i.data, doc, target) + resuffix + end + when Interscript::Node::Item::CaptureRef + if target == :par + raise Interscript::SystemConversionError, "Can't use CaptureRef in parallel mode" + elsif target == :re + "\\\\#{i.id}" + elsif target == :str + "\"\\\\#{i.id}\"" + end + when Interscript::Node::Item::Any + if target == :str + raise Interscript::SystemConversionError, "Can't use Any in a string context" # A linter could find this! + elsif target == :par + i.data.map(&:data) + elsif target == :re + case i.value + when Array + data = i.data.map { |j| compile_item(j, doc, target) } + "(?:"+data.join("|")+")" + when String + "[#{re_escape(i.value)}]" + when Range + "[#{re_escape(i.value.first)}-#{re_escape(i.value.last)}]" + end + end + end + end + + @maps_loaded = {} + @ctx = nil + class << self + attr_accessor :maps_loaded + attr_accessor :ctx + end + + def load + if !self.class.maps_loaded[@map.name] + @map.dependencies.each do |dep| + dep = dep.full_name + if !self.class.maps_loaded[dep] + Interscript.load(dep, compiler: self.class).load + end + end + + ctx = self.class.ctx + python_src_path = File.join(__dir__, '..', '..', '..', '..', 'python', 'src') + unless ctx + PyCall.sys.path.append(python_src_path) + self.class.ctx = PyCall.import_module("interscript") + end + #puts @code + Dir.mkdir("#{python_src_path}/interscript/maps") rescue nil + File.write("#{python_src_path}/interscript/maps/#{@map.name}.py", @code) + self.class.ctx.load_map(@map.name) + + self.class.maps_loaded[@map.name] = true + end + end + + def call(str, stage=:main) + load + self.class.ctx.transliterate(@map.name, str, stage.to_s) + end + + def self.read_debug_data + (ctx['map_debug'] || []).map(&:to_a).to_a + end + + def self.reset_debug_data + ctx['map_debug'].clear + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index eabec063..101688ff 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -9,6 +9,7 @@ require "interscript" require "interscript/compiler/ruby" require "interscript/compiler/javascript" unless ENV["SKIP_JS"] +require "interscript/compiler/python" unless ENV["SKIP_PYTHON"] require "interscript/utils/helpers" RSpec.configure do |config| @@ -29,6 +30,7 @@ def each_compiler &block compilers << Interscript::Interpreter compilers << Interscript::Compiler::Ruby compilers << Interscript::Compiler::Javascript unless ENV["SKIP_JS"] + compilers << Interscript::Compiler::Python unless ENV["SKIP_PYTHON"] compilers.each do |compiler| block.(compiler)