diff --git a/documents/how_to_add_new_language.md b/documents/how_to_add_new_language.md index b0ff36d3..5813dfb7 100644 --- a/documents/how_to_add_new_language.md +++ b/documents/how_to_add_new_language.md @@ -10,6 +10,9 @@ The following steps with help you identify files that need to be added or change NOTE: Take a look at [PR #40](https://github.com/unicode-org/inflection/pull/40) and [PR #111](https://github.com/unicode-org/inflection/pull/111) for example on how to add initial language support based on dictionary lookup only. In general, to bootstrap your progress look for grammatically similar language that's already supported, e.g. if you are adding Serbian look for existing Russian implementation. This will help you find most of the files you need to add/change and will speed up implementation of the rules and lexicons. +We recommend spending about a week researching the language and its components before you begin adding or modifying the files below. Look at all the files in the project, such as tokenizers, configuration files, grammar files, and the different lookup functions, to see what you need. This will save you a lot of time in the end. We strongly suggest that you stay away from hardcoded logic and rely on dictionary lookup instead. Look at all the grammemes, tokenizer logic, and multi-word phrase handling. + +Before you add new language support, go to the README.md in the inflection subfolder (inflection/inflection/README.md), build the project, and make sure all the tests pass on your computer. ## Mark your language as supported * UPDATE: inflection/src/inflection/util/LocaleUtils.hpp @@ -29,13 +32,13 @@ TODO: We need to expand what each of these do. 
* ADD: inflection/src/inflection/grammar/synthesis/*Xx*GrammarSynthesizer.hpp * ADD: inflection/src/inflection/grammar/synthesis/*Xx*GrammarSynthesizer.cpp * ADD: inflection/src/inflection/grammar/synthesis/*Xx*GrammarSynthesizer_*Xx*DisplayFunction.hpp -* ADD: inflection/src/inflection/grammar/synthesis/*Xx*GrammarSynthesizer_*Xx*DisplayFunction.hpp +* ADD: inflection/src/inflection/grammar/synthesis/*Xx*GrammarSynthesizer_*Xx*DisplayFunction.cpp * UPDATE: inflection/src/inflection/grammar/synthesis/GrammarSynthesizerFactory.cpp * UPDATE: inflection/src/inflection/grammar/synthesis/fwd.hpp ## Add language specific properties for lists, quantities and related topics * ADD: inflection/src/inflection/dialog/language/*Xx*CommonConceptFactory.hpp -* ADD: inflection/src/inflection/dialog/language/*Xx*CommonConceptFactory.hpp +* ADD: inflection/src/inflection/dialog/language/*Xx*CommonConceptFactory.cpp * UPDATE: inflection/src/inflection/dialog/language/fwd.hpp ## Define and create lexion diff --git a/inflection/resources/org/unicode/inflection/dictionary/.gitattributes b/inflection/resources/org/unicode/inflection/dictionary/.gitattributes index fe535cd8..9dc3b5af 100644 --- a/inflection/resources/org/unicode/inflection/dictionary/.gitattributes +++ b/inflection/resources/org/unicode/inflection/dictionary/.gitattributes @@ -8,6 +8,7 @@ dictionary_he.lst filter=lfs diff=lfs merge=lfs -text dictionary_hi.lst filter=lfs diff=lfs merge=lfs -text dictionary_it.lst filter=lfs diff=lfs merge=lfs -text dictionary_ko.lst filter=lfs diff=lfs merge=lfs -text +dictionary_ml.lst filter=lfs diff=lfs merge=lfs -text dictionary_nb.lst filter=lfs diff=lfs merge=lfs -text dictionary_nl.lst filter=lfs diff=lfs merge=lfs -text dictionary_pt.lst filter=lfs diff=lfs merge=lfs -text @@ -23,6 +24,7 @@ inflectional_fr.xml filter=lfs diff=lfs merge=lfs -text inflectional_he.xml filter=lfs diff=lfs merge=lfs -text inflectional_hi.xml filter=lfs diff=lfs merge=lfs -text inflectional_it.xml 
filter=lfs diff=lfs merge=lfs -text +inflectional_ml.xml filter=lfs diff=lfs merge=lfs -text inflectional_nb.xml filter=lfs diff=lfs merge=lfs -text inflectional_nl.xml filter=lfs diff=lfs merge=lfs -text inflectional_pt.xml filter=lfs diff=lfs merge=lfs -text diff --git a/inflection/resources/org/unicode/inflection/dictionary/dictionary_ml.lst b/inflection/resources/org/unicode/inflection/dictionary/dictionary_ml.lst new file mode 100644 index 00000000..320b3589 --- /dev/null +++ b/inflection/resources/org/unicode/inflection/dictionary/dictionary_ml.lst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bda9371a2aa17c08328381e678b77e769269f4ee74749dd4f9e0bd5890cf59c +size 53958746 diff --git a/inflection/resources/org/unicode/inflection/dictionary/inflectional_ml.xml b/inflection/resources/org/unicode/inflection/dictionary/inflectional_ml.xml new file mode 100644 index 00000000..d6be1b3b --- /dev/null +++ b/inflection/resources/org/unicode/inflection/dictionary/inflectional_ml.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1868dab352ff2648c2ba495bc08a3877409eadf177f573817fd03ae07174b12f +size 613479 diff --git a/inflection/resources/org/unicode/inflection/features/grammar.xml b/inflection/resources/org/unicode/inflection/features/grammar.xml index 6a620220..f7b60a2a 100644 --- a/inflection/resources/org/unicode/inflection/features/grammar.xml +++ b/inflection/resources/org/unicode/inflection/features/grammar.xml @@ -1624,6 +1624,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/inflection/resources/org/unicode/inflection/inflection/pronoun_ml.csv b/inflection/resources/org/unicode/inflection/inflection/pronoun_ml.csv new file mode 100644 index 00000000..5134f9da --- /dev/null +++ b/inflection/resources/org/unicode/inflection/inflection/pronoun_ml.csv @@ -0,0 +1,75 
@@ +അവൻ,third,singular,nominative,masculine +അവൾ,third,singular,nominative,feminine +അത്,third,singular,nominative,neuter +അവനെ,third,singular,accusative,masculine +അവന്റെ,third,singular,genitive,masculine,determination=dependent +അവന്റെത്,third,singular,genitive,masculine,determination=independent +അവളെ,third,singular,accusative,feminine +അവളുടെ,third,singular,genitive,feminine,determination=dependent +അവളുടേതു്,third,singular,genitive,feminine,determination=independent +അതിനെ,third,singular,accusative,neuter +അതിന്റെ,third,singular,genitive,neuter,determination=dependent +അതിന്റേതു്,third,singular,genitive,neuter,determination=independent +അവനിൽ,third,singular,locative,masculine +അവനാൽ,third,singular,instrumental,masculine +അവനോടു്,third,singular,sociative,masculine +അവളിൽ,third,singular,locative,feminine +അവളാൽ,third,singular,instrumental,feminine +അവളോടു്,third,singular,sociative,feminine +അതിൽ,third,singular,locative,neuter +അതാൽ,third,singular,instrumental,neuter +അതോടു്,third,singular,sociative,neuter +അവർ,third,plural,nominative +അവരെ,third,plural,accusative +അവരുടെ,third,plural,genitive,determination=dependent +അവരുടേതു്,third,plural,genitive,determination=independent +അവരിൽ,third,plural,locative +അവരാൽ,third,plural,instrumental +അവരോടു്,third,plural,sociative +നീ,second,singular,nominative,informal +താങ്കൾ,second,singular,nominative,formal +നിനക്ക്,second,singular,dative,informal +താങ്കൾക്ക്,second,singular,dative,formal +നിനെ,second,singular,accusative,informal +താങ്കളെ,second,singular,accusative,formal +നിന്റെ,second,singular,genitive,informal,determination=dependent +നിന്റേതു്,second,singular,genitive,informal,determination=independent +താങ്കളുടെ,second,singular,genitive,formal,determination=dependent +താങ്കളുടേതു്,second,singular,genitive,formal,determination=independent +നിനിൽ,second,singular,locative,informal +നിനാൽ,second,singular,instrumental,informal +നിനോടു്,second,singular,sociative,informal +താങ്കളിൽ,second,singular,locative,formal 
+താങ്കളാൽ,second,singular,instrumental,formal +താങ്കളോടു്,second,singular,sociative,formal +നിങ്ങൾ,second,plural,nominative,formal +നിങ്ങളെ,second,plural,accusative,formal +നിങ്ങൾക്ക്,second,plural,dative,formal +നിങ്ങളുടെ,second,plural,genitive,formal,determination=dependent +നിങ്ങളുടേതു്,second,plural,genitive,formal,determination=independent +നിങ്ങളിൽ,second,plural,locative,formal +നിങ്ങളാൽ,second,plural,instrumental,formal +നിങ്ങളോടു്,second,plural,sociative,formal +ഞാൻ,first,singular,nominative,exclusive +എനിക്ക്,first,singular,dative +നമുക്ക്,first,plural,dative,inclusive +എന്നെ,first,singular,accusative,exclusive +നമ്മെ,first,plural,accusative,inclusive +എന്റെ,first,singular,genitive,determination=dependent,exclusive +എന്റേത്,first,singular,genitive,determination=independent,exclusive +എന്നിൽ,first,singular,locative +എന്നാൽ,first,singular,instrumental +എന്നോടു്,first,singular,sociative +ഞങ്ങൾ,first,plural,nominative,exclusive +നാം,first,plural,nominative,inclusive +ഞങ്ങളെ,first,plural,accusative,exclusive +ഞങ്ങൾക്ക്,first,plural,dative,exclusive +ഞങ്ങളുടെ,first,plural,genitive,exclusive,determination=dependent +ഞങ്ങളുടേത്,first,plural,genitive,exclusive,determination=independent +നമ്മുടെ,first,plural,genitive,inclusive,determination=dependent +നമ്മുടേതു്,first,plural,genitive,inclusive,determination=independent +ഞങ്ങളിലു്,first,plural,locative,exclusive +ഞങ്ങളാൽ,first,plural,instrumental,exclusive +ഞങ്ങളോടു്,first,plural,sociative,exclusive +താൻ,third,singular,nominative,reflexive +തങ്ങൾ,third,plural,nominative,formal,reflexive \ No newline at end of file diff --git a/inflection/resources/org/unicode/inflection/locale/supported-locales.properties b/inflection/resources/org/unicode/inflection/locale/supported-locales.properties index 6815591d..43741bca 100644 --- a/inflection/resources/org/unicode/inflection/locale/supported-locales.properties +++ b/inflection/resources/org/unicode/inflection/locale/supported-locales.properties @@ -15,6 +15,7 @@ 
locale.group.it=it_IT,it_CH locale.group.ja=ja_JP locale.group.ko=ko_KR locale.group.ms=ms_MY +locale.group.ml=ml_IN locale.group.nb=nb_NO locale.group.nl=nl_NL,nl_BE locale.group.pt=pt_BR,pt_PT diff --git a/inflection/resources/org/unicode/inflection/tokenizer/config_ml.properties b/inflection/resources/org/unicode/inflection/tokenizer/config_ml.properties new file mode 100644 index 00000000..d9652dc0 --- /dev/null +++ b/inflection/resources/org/unicode/inflection/tokenizer/config_ml.properties @@ -0,0 +1,7 @@ +# +# Copyright 2025 Unicode Incorporated and others. All rights reserved. +# +tokenizer.implementation.class=DefaultTokenizer +tokenizer.nonDecompound.file=/org/unicode/inflection/tokenizer/ml/nondecompound.tok +tokenizer.decompound=(ശ്രീ)(.+?)(ഗുരു|സര്‍ക്കാര്‍)|(.+?)(ഗുരു|സര്‍ക്കാര്‍|ഉണ്ട്|ആണ്|ഇല്ല|ഒടൊപ്പം|ഉടൻ|ഓടെ|ഓട്|ഒപ്പം|തന്നെ|പോലും|പോലെ|ഉം|യ്|കളുടെ|ങ്ങളുടെ|ത്തിന്റെ|ൻ്റെ|ന്റെ|യുടേ|യുടെ|യാൽ|യിൽ|ഇൽ|ല്|ൽ|ക്ക്|മാർ|ങ്ങൾ|കൾ|നെ|യെ) + diff --git a/inflection/resources/org/unicode/inflection/tokenizer/ml/nondecompound.tok b/inflection/resources/org/unicode/inflection/tokenizer/ml/nondecompound.tok new file mode 100644 index 00000000..c62b299c --- /dev/null +++ b/inflection/resources/org/unicode/inflection/tokenizer/ml/nondecompound.tok @@ -0,0 +1,35 @@ +അമ്മ +അച്ഛൻ +അച്ഛി +അമ്മൻ +മകൻ +മകൾ +കുട്ടി +കുട്ടികൾ +ആൺകുട്ടി +ആൺകുട്ടികൾ +പെൺകുട്ടി +പെൺകുട്ടികൾ +കഥ +ചിത്രം +ചിത്രങ്ങൾ +ഗ്രന്ഥം +ഗ്രന്ഥങ്ങൾ +മക്കൾ +ഞാൻ +നീ +നിങ്ങൾ +അവൻ +അവൾ +അവ +അവർ +ഇത് +അത് +ഇവ +അവ +ശ്രീ +നാരായണ +ഗുരു +കേരളം +സര്‍ക്കാര്‍ +കേരളസര്‍ക്കാര്‍ diff --git a/inflection/src/inflection/dialog/PronounConcept.cpp b/inflection/src/inflection/dialog/PronounConcept.cpp index 5ced24eb..dfd5ccc2 100644 --- a/inflection/src/inflection/dialog/PronounConcept.cpp +++ b/inflection/src/inflection/dialog/PronounConcept.cpp @@ -228,7 +228,7 @@ PronounConcept::PronounConcept(const SemanticFeatureModel& model, std::u16string for (int32_t idx = 0; idx < pronounData->numValues(); idx++) { const auto& pronounEntry = 
pronounData->getPronounEntry(idx); std::u16string_view displayString(pronounEntry.first); - if (displayString.back() == u' ') { + if (!displayString.empty() && displayString.back() == u' ') { displayString.remove_suffix(1); } auto status = U_ZERO_ERROR; diff --git a/inflection/src/inflection/dialog/language/MlCommonConceptFactory.cpp b/inflection/src/inflection/dialog/language/MlCommonConceptFactory.cpp new file mode 100644 index 00000000..e426c9e4 --- /dev/null +++ b/inflection/src/inflection/dialog/language/MlCommonConceptFactory.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. + */ + +#include +#include +#include + +namespace inflection::dialog::language { + +// Forwards the language to the CommonConceptFactoryImpl base class. +MlCommonConceptFactory::MlCommonConceptFactory(const ::inflection::util::ULocale& language) + : super(language) +{ +} + +// Out-of-line definition for the destructor declared in MlCommonConceptFactory.hpp. +MlCommonConceptFactory::~MlCommonConceptFactory() +{ +} + +// Joins a formatted number and a noun phrase with a single space. +// For a count of ONE the noun phrase comes first; for every other count the number comes first. +::inflection::dialog::SpeakableString +MlCommonConceptFactory::quantifiedJoin(const ::inflection::dialog::SpeakableString& formattedNumber, + const ::inflection::dialog::SpeakableString& nounPhrase, + const ::std::u16string& /*measureWord*/, + Plurality::Rule countType) const +{ + ::inflection::dialog::SpeakableString space(u" "); + if (countType == Plurality::Rule::ONE) { + return nounPhrase + space + formattedNumber; + } + return formattedNumber + space + nounPhrase; +} + +} // namespace inflection::dialog::language diff --git a/inflection/src/inflection/dialog/language/MlCommonConceptFactory.hpp b/inflection/src/inflection/dialog/language/MlCommonConceptFactory.hpp new file mode 100644 index 00000000..7bfab1dd --- /dev/null +++ b/inflection/src/inflection/dialog/language/MlCommonConceptFactory.hpp @@ -0,0 +1,28 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. 
+ */ +#pragma once + +#include +#include +#include +#include + +namespace inflection::dialog::language { + +class MlCommonConceptFactory : public CommonConceptFactoryImpl { + using super = CommonConceptFactoryImpl; + +public: + explicit MlCommonConceptFactory(const ::inflection::util::ULocale& language); + ~MlCommonConceptFactory() override; + +protected: + ::inflection::dialog::SpeakableString quantifiedJoin( + const ::inflection::dialog::SpeakableString& formattedNumber, + const ::inflection::dialog::SpeakableString& nounPhrase, + const ::std::u16string& measureWord, + ::inflection::dialog::Plurality::Rule countType) const override; +}; + +} // namespace inflection::dialog::language diff --git a/inflection/src/inflection/dialog/language/fwd.hpp b/inflection/src/inflection/dialog/language/fwd.hpp index 6429ca3a..e952df27 100644 --- a/inflection/src/inflection/dialog/language/fwd.hpp +++ b/inflection/src/inflection/dialog/language/fwd.hpp @@ -1,4 +1,5 @@ /* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. * Copyright 2017-2024 Apple Inc. All rights reserved. */ // Forward declarations for inflection.dialog.language @@ -28,6 +29,7 @@ namespace inflection class JaCommonConceptFactory; class KoCommonConceptFactory; class KoCommonConceptFactory_KoAndList; + class MlCommonConceptFactory; class MsCommonConceptFactory; class NbCommonConceptFactory; class NlCommonConceptFactory; diff --git a/inflection/src/inflection/grammar/synthesis/GrammarSynthesizerFactory.cpp b/inflection/src/inflection/grammar/synthesis/GrammarSynthesizerFactory.cpp index ecb31303..242101af 100644 --- a/inflection/src/inflection/grammar/synthesis/GrammarSynthesizerFactory.cpp +++ b/inflection/src/inflection/grammar/synthesis/GrammarSynthesizerFactory.cpp @@ -1,4 +1,5 @@ /* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. * Copyright 2017-2024 Apple Inc. All rights reserved. 
*/ #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +43,7 @@ static const ::std::map<::inflection::util::ULocale, addSemanticFeatures>& GRAMM {::inflection::util::LocaleUtils::HINDI(), &HiGrammarSynthesizer::addSemanticFeatures}, {::inflection::util::LocaleUtils::ITALIAN(), &ItGrammarSynthesizer::addSemanticFeatures}, {::inflection::util::LocaleUtils::KOREAN(), &KoGrammarSynthesizer::addSemanticFeatures}, + {::inflection::util::LocaleUtils::MALAYALAM(), &MlGrammarSynthesizer::addSemanticFeatures}, {::inflection::util::LocaleUtils::NORWEGIAN(), &NbGrammarSynthesizer::addSemanticFeatures}, {::inflection::util::LocaleUtils::DUTCH(), &NlGrammarSynthesizer::addSemanticFeatures}, {::inflection::util::LocaleUtils::PORTUGUESE(), &PtGrammarSynthesizer::addSemanticFeatures}, diff --git a/inflection/src/inflection/grammar/synthesis/GrammemeConstants.cpp b/inflection/src/inflection/grammar/synthesis/GrammemeConstants.cpp index 52d1f31f..9fa24c65 100644 --- a/inflection/src/inflection/grammar/synthesis/GrammemeConstants.cpp +++ b/inflection/src/inflection/grammar/synthesis/GrammemeConstants.cpp @@ -159,6 +159,12 @@ const ::std::u16string& GrammemeConstants::CASE_PREPOSITIONAL() return *npc(CASE_PREPOSITIONAL_); } +const ::std::u16string& GrammemeConstants::CASE_SOCIATIVE() +{ + static auto CASE_SOCIATIVE_ = new ::std::u16string(u"sociative"); + return *npc(CASE_SOCIATIVE_); +} + const ::std::u16string& GrammemeConstants::CASE_TRANSLATIVE() { static auto CASE_TRANSLATIVE_ = new ::std::u16string(u"translative"); @@ -279,6 +285,12 @@ const ::std::u16string& GrammemeConstants::MOOD_INDICATIVE() return *npc(MOOD_INDICATIVE_); } +const ::std::u16string& GrammemeConstants::MOOD_SUBJUNCTIVE() +{ + static auto MOOD_SUBJUNCTIVE_ = new ::std::u16string(u"subjunctive"); + return *npc(MOOD_SUBJUNCTIVE_); +} + const ::std::u16string& GrammemeConstants::TENSE_PAST() { static auto TENSE_PAST_ = new ::std::u16string(u"past"); diff --git 
a/inflection/src/inflection/grammar/synthesis/GrammemeConstants.hpp b/inflection/src/inflection/grammar/synthesis/GrammemeConstants.hpp index 4010c1ba..60cc2133 100644 --- a/inflection/src/inflection/grammar/synthesis/GrammemeConstants.hpp +++ b/inflection/src/inflection/grammar/synthesis/GrammemeConstants.hpp @@ -42,6 +42,7 @@ class inflection::grammar::synthesis::GrammemeConstants final static const ::std::u16string& CASE_OBLIQUE(); static const ::std::u16string& CASE_PARTITIVE(); static const ::std::u16string& CASE_PREPOSITIONAL(); + static const ::std::u16string& CASE_SOCIATIVE(); static const ::std::u16string& CASE_TRANSLATIVE(); static const ::std::u16string& CASE_VOCATIVE(); @@ -75,6 +76,7 @@ class inflection::grammar::synthesis::GrammemeConstants final static constexpr auto MOOD = u"mood"; static const ::std::u16string& MOOD_IMPERATIVE(); static const ::std::u16string& MOOD_INDICATIVE(); + static const ::std::u16string& MOOD_SUBJUNCTIVE(); static constexpr auto TENSE = u"tense"; static const ::std::u16string& TENSE_PAST(); diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer.cpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer.cpp new file mode 100644 index 00000000..abcc7b65 --- /dev/null +++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer.cpp @@ -0,0 +1,182 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace inflection::grammar::synthesis { + +void MlGrammarSynthesizer::addSemanticFeatures(::inflection::dialog::SemanticFeatureModel& featureModel) +{ + featureModel.putDefaultFeatureFunctionByName(GrammemeConstants::NUMBER, new MlGrammarSynthesizer_NumberLookupFunction()); + featureModel.putDefaultFeatureFunctionByName(GrammemeConstants::GENDER, new MlGrammarSynthesizer_GenderLookupFunction()); + featureModel.putDefaultFeatureFunctionByName(GrammemeConstants::CASE, new MlGrammarSynthesizer_CaseLookupFunction()); + + featureModel.setDefaultDisplayFunction(new MlGrammarSynthesizer_MlDisplayFunction(featureModel)); +} + +MlGrammarSynthesizer::Number MlGrammarSynthesizer::getNumber(const ::std::u16string& value) { + if (value == GrammemeConstants::NUMBER_SINGULAR()) { + return Number::singular; + } + if (value == GrammemeConstants::NUMBER_PLURAL()) { + return Number::plural; + } + return Number::undefined; +} + +MlGrammarSynthesizer::Case MlGrammarSynthesizer::getCase(const ::std::u16string& value) { + if (value == GrammemeConstants::CASE_NOMINATIVE()) { + return Case::nominative; + } + if (value == GrammemeConstants::CASE_ACCUSATIVE()) { + return Case::accusative; + } + if (value == GrammemeConstants::CASE_DATIVE()) { + return Case::dative; + } + if (value == GrammemeConstants::CASE_GENITIVE()) { + return Case::genitive; + } + if (value == GrammemeConstants::CASE_INSTRUMENTAL()) { + return Case::instrumental; + } + if (value == GrammemeConstants::CASE_LOCATIVE()) { + return Case::locative; + } + return Case::undefined; +} + +MlGrammarSynthesizer::Person MlGrammarSynthesizer::getPerson(const ::std::u16string& value) { + if (value == GrammemeConstants::PERSON_FIRST()) { + return Person::first; + } + if (value == GrammemeConstants::PERSON_SECOND()) { + return Person::second; + } + if (value == GrammemeConstants::PERSON_THIRD()) { + return Person::third; + } + return 
Person::undefined; +} + +MlGrammarSynthesizer::Tense MlGrammarSynthesizer::getTense(const ::std::u16string& value) { + if (value == GrammemeConstants::TENSE_PAST()) { + return Tense::past; + } + if (value == GrammemeConstants::TENSE_PRESENT()) { + return Tense::present; + } + if (value == GrammemeConstants::TENSE_FUTURE()) { + return Tense::future; + } + return Tense::undefined; +} + +MlGrammarSynthesizer::Mood MlGrammarSynthesizer::getMood(const ::std::u16string& value) { + if (value == GrammemeConstants::MOOD_INDICATIVE()) { + return Mood::indicative; + } + if (value == GrammemeConstants::MOOD_IMPERATIVE()) { + return Mood::imperative; + } + if (value == GrammemeConstants::MOOD_SUBJUNCTIVE()) { + return Mood::subjunctive; + } + return Mood::undefined; +} + +MlGrammarSynthesizer::LookupKey MlGrammarSynthesizer::makeLookupKey(Number num, Case kase) { + return (static_cast(kase) & 0xFF) + | ((static_cast(num) & 0xFF) << 8); +} + +MlGrammarSynthesizer::LookupKey MlGrammarSynthesizer::makeVerbLookupKey(Person person, Number num, Tense tense, Mood mood) { + return (static_cast(person) & 0xFF) + | ((static_cast(num) & 0xFF) << 8) + | ((static_cast(tense) & 0x0F) << 24) + | ((static_cast(mood) & 0x0F) << 28); +} + +MlGrammarSynthesizer::LookupKey MlGrammarSynthesizer::buildVerbSuffixKey(const std::vector<::std::u16string>& constraintValues) { + Person person = Person::undefined; + Number num = Number::undefined; + Tense tense = Tense::undefined; + Mood mood = Mood::undefined; + + for (const auto& val : constraintValues) { + if (person == Person::undefined) { + person = getPerson(val); + } + if (num == Number::undefined) { + num = getNumber(val); + } + if (tense == Tense::undefined) { + tense = getTense(val); + } + if (mood == Mood::undefined) { + mood = getMood(val); + } + } + + return makeVerbLookupKey(person, num, tense, mood); +} + +const std::map& MlGrammarSynthesizer::MALAYALAM_SUFFIX_MAP() +{ + static auto MALAYALAM_SUFFIX_MAP_ = new ::std::map({ + 
{makeLookupKey(Number::singular, Case::nominative), u""}, + {makeLookupKey(Number::plural, Case::nominative), u"കൾ"}, + {makeLookupKey(Number::singular, Case::genitive), u"യുടെ"}, + {makeLookupKey(Number::plural, Case::genitive), u"കളുടെ"}, + {makeLookupKey(Number::singular, Case::dative), u"ക്ക്"}, + {makeLookupKey(Number::plural, Case::dative), u"കൾക്ക്"}, + }); + return *npc(MALAYALAM_SUFFIX_MAP_); +}; + +const std::map& MlGrammarSynthesizer::MALAYALAM_VERB_SUFFIX_MAP() +{ + static auto MALAYALAM_VERB_SUFFIX_MAP_ = new ::std::map({ + {makeVerbLookupKey(Person::first, Number::singular, Tense::past, Mood::indicative), u"ച്ചു"}, + {makeVerbLookupKey(Person::first, Number::plural, Tense::past, Mood::indicative), u"ഞ്ഞു"}, + {makeVerbLookupKey(Person::second, Number::singular, Tense::past, Mood::indicative), u"ച്ചു"}, + {makeVerbLookupKey(Person::second, Number::plural, Tense::past, Mood::indicative), u"ന്നു"}, + {makeVerbLookupKey(Person::third, Number::singular, Tense::past, Mood::indicative), u"ച്ചു"}, + {makeVerbLookupKey(Person::third, Number::plural, Tense::past, Mood::indicative), u"ന്നു"}, + + {makeVerbLookupKey(Person::first, Number::singular, Tense::present, Mood::indicative), u"ിക്കുന്നു"}, + {makeVerbLookupKey(Person::first, Number::plural, Tense::present, Mood::indicative), u"ിക്കുന്നു"}, + {makeVerbLookupKey(Person::second, Number::singular, Tense::present, Mood::indicative), u"ിക്കുന്നു"}, + {makeVerbLookupKey(Person::second, Number::plural, Tense::present, Mood::indicative), u"ിക്കുന്നു"}, + {makeVerbLookupKey(Person::third, Number::singular, Tense::present, Mood::indicative), u"ിക്കുന്നു"}, + {makeVerbLookupKey(Person::third, Number::plural, Tense::present, Mood::indicative), u"ിക്കുന്നു"}, + + {makeVerbLookupKey(Person::first, Number::singular, Tense::future, Mood::indicative), u" ചെയ്യും"}, + {makeVerbLookupKey(Person::first, Number::plural, Tense::future, Mood::indicative), u" ചെയ്യും"}, + {makeVerbLookupKey(Person::second, Number::singular, 
Tense::future, Mood::indicative), u" ചെയ്യും"}, + {makeVerbLookupKey(Person::second, Number::plural, Tense::future, Mood::indicative), u" ചെയ്യും"}, + {makeVerbLookupKey(Person::third, Number::singular, Tense::future, Mood::indicative), u" ചെയ്യും"}, + {makeVerbLookupKey(Person::third, Number::plural, Tense::future, Mood::indicative), u" ചെയ്യും"}, + }); + return *npc(MALAYALAM_VERB_SUFFIX_MAP_); +} + +std::u16string_view MlGrammarSynthesizer::getSuffix(LookupKey key) { // Noun suffix for a makeLookupKey(number, case) key; an empty view when the key is not mapped. + auto it = MALAYALAM_SUFFIX_MAP().find(key); + return it != MALAYALAM_SUFFIX_MAP().end() ? it->second : std::u16string_view(); +} + +std::u16string_view MlGrammarSynthesizer::getVerbSuffix(LookupKey key) { // Verb suffix for a makeVerbLookupKey(person, number, tense, mood) key; an empty view when the key is not mapped. + auto it = MALAYALAM_VERB_SUFFIX_MAP().find(key); + return it != MALAYALAM_VERB_SUFFIX_MAP().end() ? it->second : std::u16string_view(); +} + +} // namespace inflection::grammar::synthesis diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer.hpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer.hpp new file mode 100644 index 00000000..46427c86 --- /dev/null +++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer.hpp @@ -0,0 +1,76 @@ +/* +* Copyright 2025 Unicode Incorporated and others. All rights reserved. 
+*/ +#pragma once + +#include +#include +#include +#include +#include +#include + +class inflection::grammar::synthesis::MlGrammarSynthesizer final +{ +public: + static void addSemanticFeatures(::inflection::dialog::SemanticFeatureModel& model); + + enum class Number { + undefined, + singular, + plural + }; + static Number getNumber(const ::std::u16string& value); + + enum class Case { + undefined, + nominative, + accusative, + dative, + genitive, + instrumental, + locative + }; + static Case getCase(const ::std::u16string& value); + + enum class Person { + undefined, + first, + second, + third + }; + static Person getPerson(const ::std::u16string& value); + + enum class Tense { + undefined, + past, + present, + future + }; + static Tense getTense(const ::std::u16string& value); + + enum class Mood { + undefined, + indicative, + imperative, + subjunctive + }; + static Mood getMood(const ::std::u16string& value); + + typedef uint32_t LookupKey; + static LookupKey makeLookupKey(Number num, Case kase); + static LookupKey makeVerbLookupKey(Person person, Number num, Tense tense, Mood mood); + + static LookupKey buildVerbSuffixKey(const std::vector<::std::u16string>& constraintValues); + +private: + static const std::map& MALAYALAM_SUFFIX_MAP(); + static const std::map& MALAYALAM_VERB_SUFFIX_MAP(); + +public: + static ::std::u16string_view getSuffix(LookupKey key); + static ::std::u16string_view getVerbSuffix(LookupKey key); + +private: + MlGrammarSynthesizer() = delete; +}; diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_CaseLookupFunction.cpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_CaseLookupFunction.cpp new file mode 100644 index 00000000..cb354305 --- /dev/null +++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_CaseLookupFunction.cpp @@ -0,0 +1,50 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. 
+ */ +#include "MlGrammarSynthesizer_CaseLookupFunction.hpp" + +#include +#include +#include +#include + +namespace inflection::grammar::synthesis { + +// Constructor: initialize the member table here (no static locals). +MlGrammarSynthesizer_CaseLookupFunction::MlGrammarSynthesizer_CaseLookupFunction() + : m_suffixToCase_{ + { u"ന്റെ", GrammemeConstants::CASE_GENITIVE() }, + { u"യുടെ", GrammemeConstants::CASE_GENITIVE() }, + { u"ഉടെ", GrammemeConstants::CASE_GENITIVE() }, + { u"ആയുടെ", GrammemeConstants::CASE_GENITIVE() }, + { u"ഉടേതു്", GrammemeConstants::CASE_GENITIVE() }, + { u"ഉടേതു", GrammemeConstants::CASE_GENITIVE() }, + { u"ഉടെത്", GrammemeConstants::CASE_GENITIVE() }, + { u"നെ", GrammemeConstants::CASE_ACCUSATIVE() }, + { u"ക്ക്", GrammemeConstants::CASE_DATIVE() }, + { u"യ്ക്ക്", GrammemeConstants::CASE_DATIVE() }, + { u"യിൽ", GrammemeConstants::CASE_LOCATIVE() }, + { u"ഇൽ", GrammemeConstants::CASE_LOCATIVE() }, + { u"ആൽ", GrammemeConstants::CASE_INSTRUMENTAL() }, + { u"വഴി", GrammemeConstants::CASE_INSTRUMENTAL() }, + { u"ഓടെ", GrammemeConstants::CASE_SOCIATIVE() } + } +{ +} + +::inflection::dialog::SpeakableString* MlGrammarSynthesizer_CaseLookupFunction::getFeatureValue( + const ::inflection::dialog::DisplayValue& displayValue, + const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& /*constraints*/) const +{ + const std::u16string& displayString(displayValue.getDisplayString()); + + for (const auto& [suffix, caseGrammeme] : m_suffixToCase_) { + if (displayString.ends_with(suffix)) { + return new ::inflection::dialog::SpeakableString(caseGrammeme); + } + } + + return nullptr; +} + +} // namespace inflection::grammar::synthesis \ No newline at end of file diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_CaseLookupFunction.hpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_CaseLookupFunction.hpp new file mode 100644 index 00000000..bacc176b --- /dev/null +++ 
b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_CaseLookupFunction.hpp @@ -0,0 +1,39 @@ +/* +* Copyright 2025 Unicode Incorporated and others. All rights reserved. +*/ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace inflection::grammar::synthesis { + +class MlGrammarSynthesizer_CaseLookupFunction + : public ::inflection::dialog::DefaultFeatureFunction +{ +public: + typedef ::inflection::dialog::DefaultFeatureFunction super; + +public: + MlGrammarSynthesizer_CaseLookupFunction(); + ~MlGrammarSynthesizer_CaseLookupFunction() override = default; + + MlGrammarSynthesizer_CaseLookupFunction(const MlGrammarSynthesizer_CaseLookupFunction&) = delete; + MlGrammarSynthesizer_CaseLookupFunction& operator=(const MlGrammarSynthesizer_CaseLookupFunction&) = delete; + + ::inflection::dialog::SpeakableString* getFeatureValue( + const ::inflection::dialog::DisplayValue& displayValue, + const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints) const override; + +private: + const std::vector> m_suffixToCase_; +}; + +} // namespace inflection::grammar::synthesis \ No newline at end of file diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_GenderLookupFunction.cpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_GenderLookupFunction.cpp new file mode 100644 index 00000000..de8992a9 --- /dev/null +++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_GenderLookupFunction.cpp @@ -0,0 +1,138 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. 
+ */
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace inflection::grammar::synthesis {
+
+// Configures the base dictionary lookup for the Malayalam locale with the three gender
+// grammemes, restricted to nouns and pronouns, creates a Malayalam tokenizer, and caches
+// the dictionary property mask for "noun" in nounProperty (validated to be non-null).
+MlGrammarSynthesizer_GenderLookupFunction::MlGrammarSynthesizer_GenderLookupFunction()
+    : super(::inflection::util::LocaleUtils::MALAYALAM(),
+            {GrammemeConstants::GENDER_MASCULINE(),
+             GrammemeConstants::GENDER_FEMININE(),
+             GrammemeConstants::GENDER_NEUTER()},
+            {GrammemeConstants::POS_NOUN(),
+             GrammemeConstants::POS_PRONOUN()})
+    , tokenizer(npc(::inflection::tokenizer::TokenizerFactory::createTokenizer(::inflection::util::LocaleUtils::MALAYALAM())))
+    , dictionary(getDictionary())
+{
+    ::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&nounProperty, {u"noun"}));
+}
+
+MlGrammarSynthesizer_GenderLookupFunction::~MlGrammarSynthesizer_GenderLookupFunction()
+{
+}
+
+// Word endings treated as feminine by the suffix heuristic in determine().
+// The set is allocated on first use and never deleted.
+static const ::std::set<::std::u16string_view>& FEMININE_SUFFIXES()
+{
+    static auto FEMININE_SUFFIXES_ = new ::std::set<::std::u16string_view>({
+        u"ി" // e.g. പെൺ (pen) endings
+        , u"ാളി" // common feminine suffix in Malayalam nouns
+    });
+    return *npc(FEMININE_SUFFIXES_);
+}
+
+// Word endings treated as masculine by the suffix heuristic in determine().
+// The set is allocated on first use and never deleted.
+static const ::std::set<::std::u16string_view>& MASCULINE_SUFFIXES()
+{
+    static auto MASCULINE_SUFFIXES_ = new ::std::set<::std::u16string_view>({
+        u"ൻ" // e.g. ആൾ (person) endings
+        , u"ർ" // common masculine suffix in Malayalam nouns
+    });
+    return *npc(MASCULINE_SUFFIXES_);
+}
+
+// Word endings treated as neuter by the suffix heuristic in determine().
+// The set is allocated on first use and never deleted.
+static const ::std::set<::std::u16string_view>& NEUTER_SUFFIXES()
+{
+    static auto NEUTER_SUFFIXES_ = new ::std::set<::std::u16string_view>({
+        u"ത്",
+        u"ം",
+        u"യം"
+    });
+    return *npc(NEUTER_SUFFIXES_);
+}
+
+// Determines the gender grammeme of a word or phrase.
+// Returns an empty string for empty input. Otherwise tries, in order: the base-class lookup on
+// the whole input; the base-class lookup on each token the dictionary marks as a noun; the
+// base-class lookup on each token passing the dynamic_cast filter; suffix heuristics
+// (masculine, then feminine, then neuter) on the token after the chain head. If nothing
+// matches, masculine is returned.
+::std::u16string MlGrammarSynthesizer_GenderLookupFunction::determine(const ::std::u16string& word) const
+{
+    if (word.empty()) {
+        return {};
+    }
+
+    auto gender = super::determine(word);
+    if (gender.empty()) {
+        ::std::unique_ptr<::inflection::tokenizer::TokenChain> tokenChain(npc(tokenizer->createTokenChain(word)));
+
+        // First try dictionary lookup on noun tokens
+        // NOTE(review): the dynamic_cast target type is missing in this copy of the patch;
+        // presumably it selects word tokens - confirm against the repository.
+        for (auto token = tokenChain->begin(); token != tokenChain->end(); ++token) {
+            if (dynamic_cast(token.get()) != nullptr &&
+                dictionary.hasAllProperties(token->getCleanValue(), nounProperty)) {
+                gender = super::determine(token->getValue());
+                if (!gender.empty()) break;
+            }
+        }
+
+        // If still empty, try any word token
+        if (gender.empty()) {
+            for (auto token = tokenChain->begin(); token != tokenChain->end(); ++token) {
+                if (dynamic_cast(token.get()) != nullptr) {
+                    gender = super::determine(token->getValue());
+                    if (!gender.empty()) break;
+                }
+            }
+        }
+
+        // If still empty, fallback to suffix heuristics on the second token in chain
+        // (only the token directly after the head is inspected, not every token).
+        if (gender.empty()) {
+            auto head = tokenChain->getHead();
+            if (head != nullptr) {
+                auto token = npc(head)->getNext();
+                if (token != nullptr) {
+                    const auto& stringToken = npc(token)->getCleanValue();
+
+                    for (const auto& suffix : MASCULINE_SUFFIXES()) {
+                        if (stringToken.ends_with(suffix)) {
+                            gender = GrammemeConstants::GENDER_MASCULINE();
+                            break;
+                        }
+                    }
+
+                    if (gender.empty()) {
+                        for (const auto& suffix : FEMININE_SUFFIXES()) {
+                            if (stringToken.ends_with(suffix)) {
+                                gender = GrammemeConstants::GENDER_FEMININE();
+                                break;
+                            }
+                        }
+                    }
+
+                    if (gender.empty()) {
+                        for (const auto& suffix : NEUTER_SUFFIXES()) {
+                            if (stringToken.ends_with(suffix)) {
+                                gender = GrammemeConstants::GENDER_NEUTER();
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (gender.empty()) {
+        // Default to masculine if no gender is detected
+        gender = GrammemeConstants::GENDER_MASCULINE();
+    }
+    return gender;
+}
+
+} // namespace inflection::grammar::synthesis
+
diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_GenderLookupFunction.hpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_GenderLookupFunction.hpp
new file mode 100644
index 00000000..e9eca1f7
--- /dev/null
+++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_GenderLookupFunction.hpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+
+// Dictionary-backed gender lookup for Malayalam; determine() is the only overridden hook.
+class inflection::grammar::synthesis::MlGrammarSynthesizer_GenderLookupFunction
+    : public ::inflection::dialog::DictionaryLookupFunction
+{
+public:
+    typedef ::inflection::dialog::DictionaryLookupFunction super;
+
+private:
+    // Tokenizer created for the Malayalam locale in the constructor.
+    const ::std::unique_ptr<::inflection::tokenizer::Tokenizer> tokenizer;
+    // Reference obtained from getDictionary() in the constructor.
+    const ::inflection::dictionary::DictionaryMetaData& dictionary;
+    // Dictionary property mask for "noun", filled in by the constructor.
+    int64_t nounProperty { };
+public:
+    ::std::u16string determine(const ::std::u16string& word) const override;
+
+    explicit MlGrammarSynthesizer_GenderLookupFunction();
+    ~MlGrammarSynthesizer_GenderLookupFunction() override;
+    MlGrammarSynthesizer_GenderLookupFunction(const MlGrammarSynthesizer_GenderLookupFunction&) = delete;
+    MlGrammarSynthesizer_GenderLookupFunction& operator=(const MlGrammarSynthesizer_GenderLookupFunction&) = delete;
+};
diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_MlDisplayFunction.cpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_MlDisplayFunction.cpp
new file mode 100644
index 00000000..d896bb12
--- /dev/null
+++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_MlDisplayFunction.cpp
@@ -0,0
+1,283 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace inflection::grammar::synthesis {
+using dialog::SemanticFeature;
+using dialog::SemanticFeatureModel_DisplayData;
+using dialog::DisplayValue;
+
+// Grammeme names used below that have no GrammemeConstants accessor in this file.
+static constexpr auto FORMALITY_FORMAL = u"formal";
+static constexpr auto FORMALITY_INFORMAL = u"informal";
+static constexpr auto CLUSIVITY_INCLUSIVE = u"inclusive";
+static constexpr auto CLUSIVITY_EXCLUSIVE = u"exclusive";
+
+// Resolves the case/number/gender/pos features from the model (each must exist, enforced by
+// npc), configures the dictionary inflector for Malayalam with the listed grammeme groups,
+// creates the Malayalam tokenizer and freezes the set of characters that disable inflection.
+MlGrammarSynthesizer_MlDisplayFunction::MlGrammarSynthesizer_MlDisplayFunction(
+    const ::inflection::dialog::SemanticFeatureModel &model)
+    : caseFeature(*npc(model.getFeature(GrammemeConstants::CASE))),
+      numberFeature(*npc(model.getFeature(GrammemeConstants::NUMBER))),
+      genderFeature(*npc(model.getFeature(GrammemeConstants::GENDER))),
+      posFeature(*npc(model.getFeature(GrammemeConstants::POS))),
+      dictionaryInflector(
+          util::LocaleUtils::MALAYALAM(),
+          {
+              {GrammemeConstants::POS_NOUN(), GrammemeConstants::POS_VERB(), GrammemeConstants::POS_PRONOUN()},
+              {
+                  GrammemeConstants::CASE_NOMINATIVE(), GrammemeConstants::CASE_ACCUSATIVE(),
+                  GrammemeConstants::CASE_DATIVE(),
+                  GrammemeConstants::CASE_GENITIVE(), GrammemeConstants::CASE_LOCATIVE(),
+                  GrammemeConstants::CASE_INSTRUMENTAL(),
+                  GrammemeConstants::CASE_SOCIATIVE()
+              },
+              {GrammemeConstants::NUMBER_SINGULAR(), GrammemeConstants::NUMBER_PLURAL()},
+              {
+                  GrammemeConstants::GENDER_MASCULINE(), GrammemeConstants::GENDER_FEMININE(),
+                  GrammemeConstants::GENDER_NEUTER()
+              },
+              {FORMALITY_FORMAL, FORMALITY_INFORMAL},
+              {CLUSIVITY_INCLUSIVE, CLUSIVITY_EXCLUSIVE},
+              {
+                  GrammemeConstants::PERSON_FIRST(), GrammemeConstants::PERSON_SECOND(),
+                  GrammemeConstants::PERSON_THIRD()
+              },
+              {GrammemeConstants::TENSE_PAST(), GrammemeConstants::TENSE_PRESENT(), GrammemeConstants::TENSE_FUTURE()},
+              {
+                  GrammemeConstants::MOOD_INDICATIVE(), GrammemeConstants::MOOD_IMPERATIVE(), GrammemeConstants::MOOD_SUBJUNCTIVE()
+              } // local constant
+          },
+          {},
+          true)
+    , tokenizer(npc(inflection::tokenizer::TokenizerFactory::createTokenizer(util::LocaleUtils::MALAYALAM())))
+    , malayalamInflectableChars(::inflection::lang::StringFilterUtil::MALAYALAM_SCRIPT())
+    , nonMalayalamChars(u"[\\p{Latin}\\p{Nd}\\p{Punct}]")
+{
+    inflection::util::UnicodeSetUtils::freeze(&nonMalayalamChars);
+}
+
+MlGrammarSynthesizer_MlDisplayFunction::~MlGrammarSynthesizer_MlDisplayFunction() = default;
+
+// Collects the non-empty constraint values in the fixed order case, number, gender, pos.
+std::vector MlGrammarSynthesizer_MlDisplayFunction::buildConstraintVector(
+    const std::map &constraints) const
+{
+    std::vector vals;
+    auto addIfNotEmpty = [&](const SemanticFeature &f) {
+        const auto v = GrammarSynthesizerUtil::getFeatureValue(constraints, f);
+        if (!v.empty()) {
+            vals.push_back(v);
+        }
+    };
+
+    addIfNotEmpty(caseFeature);
+    addIfNotEmpty(numberFeature);
+    addIfNotEmpty(genderFeature);
+
+    const auto posVal = GrammarSynthesizerUtil::getFeatureValue(constraints, posFeature);
+    if (!posVal.empty()) {
+        vals.push_back(posVal);
+    }
+
+    return vals;
+}
+
+// Fallback noun/verb inflection unchanged; subjunctive handled via constraintValues
+//
+// Re-tokenizes `phrase` and inflects only the last significant token that contains Malayalam
+// script characters; every other token is copied through unchanged, so the result is the
+// whole phrase again. Returns std::nullopt when the phrase has no words, and the phrase
+// unchanged when no token contains Malayalam characters. When the dictionary cannot inflect
+// the token, the only guess made is appending the accusative suffix (skipped when the token
+// already ends in "കൾ").
+std::optional MlGrammarSynthesizer_MlDisplayFunction::guessFallbackNounInflection(
+    const std::u16string &phrase,
+    const std::vector &constraintValues) const
+{
+    std::unique_ptr tokenChain(tokenizer->createTokenChain(phrase));
+    if (!tokenChain || tokenChain->getWordCount() == 0) {
+        return std::nullopt;
+    }
+
+    const inflection::tokenizer::Token *lastSignificantToken = nullptr;
+    int64_t lastTokenGrammemes = 0;
+
+    for (auto &token: *tokenChain) {
+        if (!token.isSignificant()) {
+            continue;
+        }
+
+        int64_t combinedType = 0;
+        dictionaryInflector.getDictionary().getCombinedBinaryType(&combinedType, token.getValue());
+
+        if (inflection::util::UnicodeSetUtils::containsSome(malayalamInflectableChars, token.getValue())) {
+            lastSignificantToken = &token;
+            lastTokenGrammemes = combinedType;
+        }
+    }
+
+    if (!lastSignificantToken) {
+        return phrase;
+    }
+
+    std::u16string result;
+    for (auto &token: *tokenChain) {
+        std::u16string tokenVal = token.getValue();
+
+        if (token.isSignificant() && &token == lastSignificantToken) {
+            auto inflected = dictionaryInflector.inflect(tokenVal, lastTokenGrammemes, constraintValues);
+            if (!inflected.has_value()) {
+                // Take the first constraint value that parses as a number / as a case.
+                MlGrammarSynthesizer::Number num = MlGrammarSynthesizer::Number::undefined;
+                MlGrammarSynthesizer::Case caseVal = MlGrammarSynthesizer::Case::undefined;
+                for (const auto &val: constraintValues) {
+                    if (num == MlGrammarSynthesizer::Number::undefined) {
+                        num = MlGrammarSynthesizer::getNumber(val);
+                    }
+                    if (caseVal == MlGrammarSynthesizer::Case::undefined) {
+                        caseVal = MlGrammarSynthesizer::getCase(val);
+                    }
+                }
+
+                if (caseVal == MlGrammarSynthesizer::Case::accusative && !tokenVal.ends_with(u"കൾ")) {
+                    auto key = MlGrammarSynthesizer::makeLookupKey(num, caseVal);
+                    tokenVal += MlGrammarSynthesizer::getSuffix(key);
+                }
+            }
+            else {
+                tokenVal = *inflected;
+            }
+        }
+
+        result += tokenVal;
+    }
+
+    return result;
+}
+
+// Verb guess: appends the suffix selected by the constraint values to the token as-is.
+// Always returns a value.
+static std::optional guessFallbackVerbInflection(
+    const std::u16string &token,
+    const std::vector &constraintValues)
+{
+    auto key = MlGrammarSynthesizer::buildVerbSuffixKey(constraintValues);
+    return token + std::u16string(MlGrammarSynthesizer::getVerbSuffix(key));
+}
+
+// Inflects the last significant token of `phrase` according to `constraintValues` and returns
+// the re-assembled phrase. The dictionary is tried first; when it has no answer and
+// `enableInflectionGuess` is set, a noun/pronoun or verb guess is made depending on the
+// part-of-speech constraint. If there is still no result, the accusative suffix is appended
+// when the accusative case was requested. A phrase without words is returned unchanged.
+std::u16string MlGrammarSynthesizer_MlDisplayFunction::inflectPhrase(
+    const std::u16string &phrase,
+    const std::vector &constraintValues,
+    bool enableInflectionGuess) const
+{
+    std::unique_ptr tokenChain(tokenizer->createTokenChain(phrase));
+    if (!tokenChain || tokenChain->getWordCount() == 0) {
+        return phrase;
+    }
+
+    // The first part-of-speech value among the constraints selects the guessing strategy.
+    std::u16string posVal;
+    for (const auto &val: constraintValues) {
+        if (val == GrammemeConstants::POS_NOUN() ||
+            val == GrammemeConstants::POS_PRONOUN() ||
+            val == GrammemeConstants::POS_VERB())
+        {
+            posVal = val;
+            break;
+        }
+    }
+
+    const inflection::tokenizer::Token *lastSignificantToken = nullptr;
+    for (const auto &token: *tokenChain) {
+        if (token.isSignificant()) {
+            lastSignificantToken = &token;
+        }
+    }
+
+    std::u16string result;
+    for (const auto &token: *tokenChain) {
+        // NOTE(review): a separator is inserted here while every token value, including
+        // non-significant ones, is also appended below. If the tokenizer emits whitespace
+        // tokens this doubles the space between words - confirm against the tokenizer.
+        if (!result.empty() && token.isSignificant()) {
+            result += u" ";
+        }
+        std::u16string tokenVal = token.getValue();
+
+        if (&token == lastSignificantToken) {
+            int64_t lastTokenGrammemes = 0;
+            dictionaryInflector.getDictionary().getCombinedBinaryType(&lastTokenGrammemes, tokenVal);
+
+            auto inflectedOpt = dictionaryInflector.inflect(tokenVal, lastTokenGrammemes, constraintValues);
+
+            if (!inflectedOpt.has_value() && enableInflectionGuess) {
+                if (posVal == GrammemeConstants::POS_NOUN() || posVal == GrammemeConstants::POS_PRONOUN()) {
+                    // Guess on this token only. The helper returns its whole input with one
+                    // token inflected, so handing it the re-joined phrase would append the
+                    // preceding words a second time to `result`.
+                    inflectedOpt = guessFallbackNounInflection(tokenVal, constraintValues);
+                }
+                else if (posVal == GrammemeConstants::POS_VERB()) {
+                    inflectedOpt = guessFallbackVerbInflection(tokenVal, constraintValues);
+                }
+            }
+
+            if (inflectedOpt.has_value()) {
+                tokenVal = *inflectedOpt;
+            }
+            else {
+                // Take the first constraint value that parses as a number / as a case.
+                MlGrammarSynthesizer::Number num = MlGrammarSynthesizer::Number::undefined;
+                MlGrammarSynthesizer::Case caseVal = MlGrammarSynthesizer::Case::undefined;
+                for (const auto &val: constraintValues) {
+                    if (num == MlGrammarSynthesizer::Number::undefined) {
+                        num = MlGrammarSynthesizer::getNumber(val);
+                    }
+                    if (caseVal == MlGrammarSynthesizer::Case::undefined) {
+                        caseVal = MlGrammarSynthesizer::getCase(val);
+                    }
+                }
+
+                if (caseVal == MlGrammarSynthesizer::Case::accusative && !tokenVal.ends_with(u"കൾ")) {
+                    auto key = MlGrammarSynthesizer::makeLookupKey(num, caseVal);
+                    tokenVal += MlGrammarSynthesizer::getSuffix(key);
+                }
+            }
+        }
+
+        result += tokenVal;
+    }
+
+    return result;
+}
+
+// Picks the best display value for the constraints and returns a newly allocated DisplayValue.
+// Returns nullptr when there is no usable display value. The text is passed through
+// uninflected when there are no constraints, when it has no Malayalam script characters, or
+// when it contains Latin letters, digits or punctuation.
+// NOTE(review): nullptr is also returned when inflection leaves the text unchanged (for
+// example when the requested form equals the stored form) - confirm callers expect that.
+::inflection::dialog::DisplayValue *MlGrammarSynthesizer_MlDisplayFunction::getDisplayValue(
+    const ::inflection::dialog::SemanticFeatureModel_DisplayData &displayData,
+    const std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints,
+    bool enableInflectionGuess) const
+{
+    const auto displayValue = GrammarSynthesizerUtil::getTheBestDisplayValue(displayData, constraints);
+    if (!displayValue || displayValue->getDisplayString().empty()) {
+        return nullptr;
+    }
+
+    const std::u16string &firstDisplayValue = displayValue->getDisplayString();
+    if (constraints.empty() ||
+        !inflection::util::UnicodeSetUtils::containsSome(malayalamInflectableChars, firstDisplayValue) ||
+        inflection::util::UnicodeSetUtils::containsSome(nonMalayalamChars, firstDisplayValue))
+    {
+        return new DisplayValue(firstDisplayValue, constraints);
+    }
+
+    std::vector constraintValues = buildConstraintVector(constraints);
+
+    std::u16string inflected = inflectPhrase(firstDisplayValue, constraintValues, enableInflectionGuess);
+    if (!inflected.empty() && inflected != firstDisplayValue) {
+        return new DisplayValue(inflected, constraints);
+    }
+
+    return nullptr;
+}
+
+} // namespace inflection::grammar::synthesis
diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_MlDisplayFunction.hpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_MlDisplayFunction.hpp
new file mode 100644
index 00000000..feffe09c
--- /dev/null
+++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_MlDisplayFunction.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Display function for Malayalam: chooses a display value and inflects it to match the
+// requested constraints, using the dictionary first and optional guessing afterwards.
+class inflection::grammar::synthesis::MlGrammarSynthesizer_MlDisplayFunction
+    : public virtual ::inflection::dialog::DefaultDisplayFunction
+{
+public:
+    using super = ::inflection::dialog::DefaultDisplayFunction;
+
+private:
+    // Features resolved from the SemanticFeatureModel passed to the constructor.
+    const ::inflection::dialog::SemanticFeature& caseFeature;
+    const ::inflection::dialog::SemanticFeature& numberFeature;
+    const ::inflection::dialog::SemanticFeature& genderFeature;
+    const ::inflection::dialog::SemanticFeature& posFeature;
+    ::inflection::dialog::DictionaryLookupInflector dictionaryInflector;
+    std::unique_ptr tokenizer;
+    // Characters of the Malayalam script; text without any of them is not inflected.
+    const icu4cxx::UnicodeSet& malayalamInflectableChars;
+    // Latin letters, digits and punctuation; text containing any of them is not inflected.
+    icu4cxx::UnicodeSet nonMalayalamChars;
+
+private:
+    std::vector buildConstraintVector(const std::map &constraints) const;
+    std::optional guessFallbackNounInflection(const std::u16string &phrase, const std::vector &constraintValues) const;
+
+public:
+    // Returns a newly allocated DisplayValue, or nullptr when nothing suitable is available.
+    ::inflection::dialog::DisplayValue* getDisplayValue(
+        const ::inflection::dialog::SemanticFeatureModel_DisplayData& displayData,
+        const std::map<::inflection::dialog::SemanticFeature, std::u16string>& constraints,
+        bool enableInflectionGuess) const override;
+
+    // Inflects the last significant token of the phrase and returns the re-assembled phrase.
+    ::std::u16string inflectPhrase(
+        const ::std::u16string& phrase,
+        const ::std::vector<::std::u16string>& constraintValues,
+        bool enableInflectionGuess) const;
+
+public:
+    explicit MlGrammarSynthesizer_MlDisplayFunction(const ::inflection::dialog::SemanticFeatureModel& model);
+    ~MlGrammarSynthesizer_MlDisplayFunction() override;
+    // Copying is disabled; the parameters are const references like in the sibling classes.
+    MlGrammarSynthesizer_MlDisplayFunction(const MlGrammarSynthesizer_MlDisplayFunction&) = delete;
+    MlGrammarSynthesizer_MlDisplayFunction& operator=(const MlGrammarSynthesizer_MlDisplayFunction&) = delete;
+
+private:
+    friend class MlGrammarSynthesizer;
+};
diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_NumberLookupFunction.cpp
b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_NumberLookupFunction.cpp
new file mode 100644
index 00000000..dac217aa
--- /dev/null
+++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_NumberLookupFunction.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ */
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace inflection::grammar::synthesis {
+
+// Configures the base dictionary lookup for the Malayalam locale with the singular/plural
+// grammemes, restricted to nouns and verbs, creates a Malayalam tokenizer, and caches the
+// dictionary property mask for "noun" in nounProperty (validated to be non-null).
+MlGrammarSynthesizer_NumberLookupFunction::MlGrammarSynthesizer_NumberLookupFunction()
+    : super(::inflection::util::LocaleUtils::MALAYALAM(),
+            {GrammemeConstants::NUMBER_SINGULAR(), GrammemeConstants::NUMBER_PLURAL()},
+            {GrammemeConstants::POS_NOUN(), GrammemeConstants::POS_VERB()})
+    , tokenizer(npc(::inflection::tokenizer::TokenizerFactory::createTokenizer(::inflection::util::LocaleUtils::MALAYALAM())))
+    , dictionary(getDictionary())
+{
+    ::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&nounProperty, {u"noun"}));
+}
+
+MlGrammarSynthesizer_NumberLookupFunction::~MlGrammarSynthesizer_NumberLookupFunction()
+{
+}
+
+// Determines the number grammeme of a word or phrase.
+// Returns an empty string for empty input. Otherwise tries, in order: the base-class lookup on
+// the whole input; the base-class lookup on each token the dictionary marks as a noun; a
+// plural-suffix check on the token directly before the end of the chain. If nothing matches,
+// singular is returned.
+::std::u16string MlGrammarSynthesizer_NumberLookupFunction::determine(const ::std::u16string& word) const
+{
+    if (word.empty()) {
+        return {};
+    }
+
+    auto out = super::determine(word);
+    if (!out.empty()) {
+        return out;
+    }
+
+    std::unique_ptr<::inflection::tokenizer::TokenChain> tokenChain(npc(tokenizer->createTokenChain(word)));
+
+    // NOTE(review): the dynamic_cast target type is missing in this copy of the patch;
+    // presumably it selects word tokens - confirm against the repository.
+    for (const auto& token : *tokenChain) {
+        if (dynamic_cast(&token) != nullptr) {
+            if (dictionary.hasAllProperties(token.getCleanValue(), nounProperty)) {
+                out = super::determine(token.getValue());
+                if (!out.empty()) {
+                    return out;
+                }
+            }
+        }
+    }
+
+    // plural suffix detection
+
+    // Only the token immediately preceding the chain end is inspected.
+    const auto& lastToken = npc(npc(tokenChain->getEnd())->getPrevious())->getValue();
+    for (const auto& suffix : {u"കൾ", u"ങ്ങൾ", u"മാർ", u"വർ", u"കളുടെ", u"ങ്ങൾക്ക്"}) {
+        if (lastToken.ends_with(suffix)) {
+            return GrammemeConstants::NUMBER_PLURAL();
+        }
+    }
+
+    return GrammemeConstants::NUMBER_SINGULAR();
+}
+
+} // namespace inflection::grammar::synthesis
diff --git a/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_NumberLookupFunction.hpp b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_NumberLookupFunction.hpp
new file mode 100644
index 00000000..bc3675cb
--- /dev/null
+++ b/inflection/src/inflection/grammar/synthesis/MlGrammarSynthesizer_NumberLookupFunction.hpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ */
+#pragma once
+
+#include
+#include
+#include
+
+// Dictionary-backed number lookup for Malayalam; determine() is the only overridden hook.
+class inflection::grammar::synthesis::MlGrammarSynthesizer_NumberLookupFunction
+    : public ::inflection::dialog::DictionaryLookupFunction
+{
+public:
+    typedef ::inflection::dialog::DictionaryLookupFunction super;
+
+private:
+    // Tokenizer created for the Malayalam locale in the constructor.
+    const ::std::unique_ptr<::inflection::tokenizer::Tokenizer> tokenizer;
+    // Reference obtained from getDictionary() in the constructor.
+    const ::inflection::dictionary::DictionaryMetaData& dictionary;
+    // Dictionary property mask for "noun", filled in by the constructor.
+    int64_t nounProperty {};
+
+public:
+    ::std::u16string determine(const ::std::u16string& word) const override;
+
+    MlGrammarSynthesizer_NumberLookupFunction();
+    ~MlGrammarSynthesizer_NumberLookupFunction() override;
+    MlGrammarSynthesizer_NumberLookupFunction(const MlGrammarSynthesizer_NumberLookupFunction&) = delete;
+    MlGrammarSynthesizer_NumberLookupFunction& operator=(const MlGrammarSynthesizer_NumberLookupFunction&) = delete;
+};
+
diff --git a/inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.cpp b/inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.cpp
index c46dd1f1..e6ac3bdb 100644
--- a/inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.cpp
+++ b/inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.cpp
@@ -11,14 +11,18 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
#include +#include namespace inflection::grammar::synthesis { @@ -42,7 +46,7 @@ SrGrammarSynthesizer_SrDisplayFunction::~SrGrammarSynthesizer_SrDisplayFunction( { } -::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const +::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectFromDictionary(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const { ::std::u16string countString(GrammarSynthesizerUtil::getFeatureValue(constraints, numberFeature)); ::std::u16string caseString(GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature)); @@ -61,7 +65,6 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s if (!genderString.empty()) { string_constraints.emplace_back(genderString); } - // The nominative/caseless is unmarked in the patterns, so we need to do something like this int64_t wordGrammemes = 0; dictionary.getCombinedBinaryType(&wordGrammemes, lemma); @@ -77,7 +80,66 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s return inflection; } -::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue(const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool /* enableInflectionGuess */) const +namespace { + +// Rule based inflectors for four declination groups. +// Masculine or neuter ending in o or e and masculine ending with consonant. 
+::std::u16string inflectByRuleOE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender); +// Neuter ending in e +::std::u16string inflectByRuleE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender); +// All genders ending in a +::std::u16string inflectByRuleA(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase); +// Feminine, ending with consonant +::std::u16string inflectByRuleConsonant(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender); + +// Number of cases in Serbian. +static constexpr auto NUMBER_OF_CASES = 7UL; + +// Given the table of all suffixes, both for singular and plural, append suffix to lemma, matching the number and case. +::std::u16string applySuffix(const ::std::u16string&, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>&, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>&, const ::std::u16string&, const ::std::u16string&); + +// Check if proper noun by checking the first character is capital letter. +bool isProperNoun(const ::std::u16string &lemma); + +} // namespace + +::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectWithRule(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const +{ + ::std::u16string countString(GrammarSynthesizerUtil::getFeatureValue(constraints, numberFeature)); + ::std::u16string caseString(GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature)); + auto genderString = GrammarSynthesizerUtil::getFeatureValue(constraints, genderFeature); + + ::std::u16string inflection; + + // If one of singular/plural, case and gender are not specified return lemma. 
+ if (countString.empty() || caseString.empty() || genderString.empty()) { + return lemma; + } + + // Do nothing for singular, nominative. + if (countString == GrammemeConstants::NUMBER_SINGULAR() && caseString == GrammemeConstants::CASE_NOMINATIVE()) { + return lemma; + } + + // These are four declention groups in the language. + if ((lemma.ends_with(u'о') || lemma.ends_with(u'е')) && (genderString == GrammemeConstants::GENDER_MASCULINE() || genderString == GrammemeConstants::GENDER_NEUTER())) { + inflection = inflectByRuleOE(lemma, countString, caseString, genderString); + } else if (lemma.ends_with(u'е') && genderString == GrammemeConstants::GENDER_NEUTER()) { + inflection = inflectByRuleE(lemma, countString, caseString, genderString); + } else if (lemma.ends_with(u'а')) { + inflection = inflectByRuleA(lemma, countString, caseString); + } else { + inflection = inflectByRuleConsonant(lemma, countString, caseString, genderString); + } + + if (inflection.empty()) { + inflection = lemma; + } + + return inflection; +} + +::inflection::dialog::DisplayValue *SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue(const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool enableInflectionGuess) const { ::std::u16string displayString; if (!displayData.getValues().empty()) { @@ -87,9 +149,154 @@ ::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::get return nullptr; } if (dictionary.isKnownWord(displayString)) { - displayString = inflectString(constraints, displayString); + displayString = inflectFromDictionary(constraints, displayString); + } else if (enableInflectionGuess) { + // Let's use rule based inflection for nouns. Assume lemma is singular, nominative. 
+ displayString = inflectWithRule(constraints, displayString); } return new ::inflection::dialog::DisplayValue(displayString, constraints); } +namespace { + +static bool isConsonant(char16_t ch) { + return ::inflection::lang::StringFilterUtil::CYRILLIC_SCRIPT().contains(ch) && !::inflection::dictionary::PhraseProperties::DEFAULT_VOWELS_START().contains(ch); +} + +static bool isVowel(char16_t ch) { + return ::inflection::lang::StringFilterUtil::CYRILLIC_SCRIPT().contains(ch) && ::inflection::dictionary::PhraseProperties::DEFAULT_VOWELS_START().contains(ch); +} + +// Some rules require number of syllables in the word. It's counted as all vowels plus r if in between consonants, or if it starts a word followed by a consonant. +// We care about 1, 2 and more than 2 cases. +enum class Syllables { + ONE_SYLLABLE, + TWO_SYLLABLES, + MULTI_SYLLABLES, +}; +Syllables countSyllables(const ::std::u16string& lemma) { + uint16_t total = 0; + size_t index = 0; + const size_t length = lemma.length(); + for (const char16_t ch: lemma) { + if (isVowel(ch)) { + ++total; + } + // Check case where R is at the begining followed by a consonant. + if ((ch == u'р' || ch == u'Р') && (index == 0 && index + 1 < length)) { + if (isConsonant(lemma[index + 1])) { + ++total; + } + } else if ((ch == u'р' || ch == u'Р') && (index != 0 && index + 1 < length)) { + if (isConsonant(lemma[index - 1]) && isConsonant(lemma[index + 1])) { + ++total; + } + } + ++index; + } + + if (total == 1) { + return Syllables::ONE_SYLLABLE; + } else if (total == 2) { + return Syllables::TWO_SYLLABLES; + } else { + return Syllables::MULTI_SYLLABLES; + } +} + +::std::u16string inflectByRuleOE(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string &gender) +{ + // TODO(nciric): implement logic. 
+ return lemma; +} + +::std::u16string inflectByRuleE(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string &gender) +{ + // TODO(nciric): implement logic. + return lemma; +} + +::std::u16string inflectByRuleA(const ::std::u16string &lemma, const ::std::u16string &number, const ::std::u16string &targetCase) +{ + static constexpr auto suffix_sg = ::std::to_array<::std::u16string_view>({u"а", u"е", u"и", u"у", u"а", u"ом", u"и"}); + static constexpr auto suffix_pl = ::std::to_array<::std::u16string_view>({u"е", u"а", u"ама", u"е", u"е", u"ама", u"ама"}); + + ::std::u16string base = lemma; + // Remove trailing a and apply suffix. + base.pop_back(); + base = applySuffix(base, suffix_sg, suffix_pl, number, targetCase); + + // Vocative singular and genitive plural require special processing in some cases. + if (number == GrammemeConstants::NUMBER_SINGULAR() && targetCase == GrammemeConstants::CASE_VOCATIVE()) { + Syllables syllables = countSyllables(lemma); + if (lemma.ends_with(u"ица") && syllables == Syllables::MULTI_SYLLABLES) { + base.back() = u'е'; + } + if (isProperNoun(lemma) && syllables == Syllables::TWO_SYLLABLES) { + base.back() = u'о'; + } + } + + if (number == GrammemeConstants::NUMBER_PLURAL() && targetCase == GrammemeConstants::CASE_GENITIVE()) { + if (lemma.ends_with(u"тња") || lemma.ends_with(u"дња") || lemma.ends_with(u"пта") || lemma.ends_with(u"лба") || lemma.ends_with(u"рва")) { + base.back() = u'и'; + } + static const char16_t *mappings[][2] = { + {u"јка", u"јака"}, + {u"мља", u"маља"}, + {u"вца", u"ваца"}, + {u"тка", u"така"}, + {u"пка", u"пака"}, + }; + for (const auto &[suffix, replacement] : mappings) { + if (base.ends_with(suffix)) { + auto suffix_length = std::u16string_view(suffix).length(); + base.replace(base.length() - suffix_length, suffix_length, replacement); + } + } + } + + return base; +} + +::std::u16string 
inflectByRuleConsonant(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string & gender) +{ + // TODO(nciric): implement logic. + return lemma; +} + +::std::u16string applySuffix(const ::std::u16string &lemma, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>& suffix_sg, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>& suffix_pl, + const ::std::u16string &number, const ::std::u16string &targetCase) +{ + const ::std::map<::std::u16string, size_t> case_index = { + {GrammemeConstants::CASE_NOMINATIVE(), 0}, + {GrammemeConstants::CASE_GENITIVE(), 1}, + {GrammemeConstants::CASE_DATIVE(), 2}, + {GrammemeConstants::CASE_ACCUSATIVE(), 3}, + {GrammemeConstants::CASE_VOCATIVE(), 4}, + {GrammemeConstants::CASE_INSTRUMENTAL(), 5}, + {GrammemeConstants::CASE_LOCATIVE(), 6} + }; + + auto index = case_index.at(targetCase); + + if (number == GrammemeConstants::NUMBER_SINGULAR()) { + return lemma + ::std::u16string(suffix_sg[index]); + } else { + return lemma + ::std::u16string(suffix_pl[index]); + } +} + +bool isProperNoun(const ::std::u16string &lemma) { + // Check if first character is in range of Cyrl capital letters. 
+ auto first_ch = lemma.front(); + if (0x402 <= first_ch && first_ch <= 0x428) { + return true; + } + + return false; +} + +} // namespace + } // namespace inflection::grammar::synthesis diff --git a/inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.hpp b/inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.hpp index 8e5d66b1..c8ae8f9f 100644 --- a/inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.hpp +++ b/inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.hpp @@ -30,7 +30,8 @@ class inflection::grammar::synthesis::SrGrammarSynthesizer_SrDisplayFunction SrGrammarSynthesizer_SrDisplayFunction& operator=(const SrGrammarSynthesizer_SrDisplayFunction&) = delete; private: - ::std::u16string inflectString(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const; + ::std::u16string inflectFromDictionary(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const; + ::std::u16string inflectWithRule(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const; const ::inflection::dictionary::DictionaryMetaData& dictionary; const ::inflection::dialog::SemanticFeature& caseFeature; diff --git a/inflection/src/inflection/grammar/synthesis/fwd.hpp b/inflection/src/inflection/grammar/synthesis/fwd.hpp index 0693277e..1e02f219 100644 --- a/inflection/src/inflection/grammar/synthesis/fwd.hpp +++ b/inflection/src/inflection/grammar/synthesis/fwd.hpp @@ -69,6 +69,11 @@ namespace inflection class NbGrammarSynthesizer; class NbGrammarSynthesizer_ArticleLookupFunction; class NbGrammarSynthesizer_NbDisplayFunction; + class MlGrammarSynthesizer; + class MlGrammarSynthesizer_NumberLookupFunction; + class MlGrammarSynthesizer_GenderLookupFunction; + class 
MlGrammarSynthesizer_CaseLookupFunction;
+            class MlGrammarSynthesizer_MlDisplayFunction;
             class NlGrammarSynthesizer;
             class NlGrammarSynthesizer_ArticleLookupFunction;
             class NlGrammarSynthesizer_DefinitenessLookupFunction;
diff --git a/inflection/src/inflection/lang/StringFilterUtil.cpp b/inflection/src/inflection/lang/StringFilterUtil.cpp
index dd095729..f9942dff 100644
--- a/inflection/src/inflection/lang/StringFilterUtil.cpp
+++ b/inflection/src/inflection/lang/StringFilterUtil.cpp
@@ -41,6 +41,13 @@ const icu4cxx::UnicodeSet& StringFilterUtil::HEBREW_SCRIPT()
     return *npc(HEBREW_SCRIPT_);
 }
 
+// Built on first use from the [:Malayalam:] pattern, frozen, and never deleted.
+const icu4cxx::UnicodeSet& StringFilterUtil::MALAYALAM_SCRIPT()
+{
+    static auto MALAYALAM_SCRIPT_ =
+        ::inflection::util::UnicodeSetUtils::freeze(new ::icu4cxx::UnicodeSet(u"[:Malayalam:]"));
+    return *npc(MALAYALAM_SCRIPT_);
+}
+
 const icu4cxx::UnicodeSet& StringFilterUtil::HAN_SCRIPT()
 {
     static auto HAN_SCRIPT_ = ::inflection::util::UnicodeSetUtils::freeze(new ::icu4cxx::UnicodeSet(u"[:Han:]"));
diff --git a/inflection/src/inflection/lang/StringFilterUtil.hpp b/inflection/src/inflection/lang/StringFilterUtil.hpp
index 10a32b85..d266bbae 100644
--- a/inflection/src/inflection/lang/StringFilterUtil.hpp
+++ b/inflection/src/inflection/lang/StringFilterUtil.hpp
@@ -69,6 +69,10 @@ class INFLECTION_INTERNAL_API inflection::lang::StringFilterUtil final
      * A set of all characters in the Hebrew script.
      */
     static const ::icu4cxx::UnicodeSet& HEBREW_SCRIPT();
+    /**
+     * A set of all characters in the Malayalam script.
+     */
+    static const ::icu4cxx::UnicodeSet& MALAYALAM_SCRIPT();
     /**
      * A set of all characters in the Han script. The Han script is unified between Chinese, Japanese and Korean.
      */
diff --git a/inflection/src/inflection/util/LocaleUtils.cpp b/inflection/src/inflection/util/LocaleUtils.cpp
index 0a5cdc9f..8238bec9 100644
--- a/inflection/src/inflection/util/LocaleUtils.cpp
+++ b/inflection/src/inflection/util/LocaleUtils.cpp
@@ -407,6 +407,18 @@ const ULocale& LocaleUtils::MALAYSIA()
     return *npc(MALAYSIA_);
 }
 
+// Locale object for language "ml"; created on first use and never deleted.
+const ULocale& LocaleUtils::MALAYALAM()
+{
+    static auto MALAYALAM_ = new ULocale("ml");
+    return *npc(MALAYALAM_);
+}
+
+// Locale object for language "ml", region "IN"; created on first use and never deleted.
+const ULocale& LocaleUtils::INDIA_MALAYALAM()
+{
+    static auto INDIA_MALAYALAM_ = new ULocale("ml", "IN");
+    return *npc(INDIA_MALAYALAM_);
+}
+
 const ULocale& LocaleUtils::NORWEGIAN()
 {
     static auto NORWEGIAN_ = new ULocale("nb");
diff --git a/inflection/src/inflection/util/LocaleUtils.hpp b/inflection/src/inflection/util/LocaleUtils.hpp
index e5fa8582..ac4ec784 100644
--- a/inflection/src/inflection/util/LocaleUtils.hpp
+++ b/inflection/src/inflection/util/LocaleUtils.hpp
@@ -376,6 +376,14 @@ class INFLECTION_CLASS_API inflection::util::LocaleUtils final
      * ms_MY: Malay (Malaysia)
      */
     static const ::inflection::util::ULocale& MALAYSIA();
+    /**
+     * ml: Malayalam
+     */
+    static const ::inflection::util::ULocale& MALAYALAM();
+    /**
+     * ml_IN: Malayalam (India)
+     */
+    static const ::inflection::util::ULocale& INDIA_MALAYALAM();
     /**
      * nb: Norwegian Bokmål
      */
diff --git a/inflection/test/resources/inflection/dialog/inflection/ml.xml b/inflection/test/resources/inflection/dialog/inflection/ml.xml
new file mode 100644
index 00000000..3fff089c
--- /dev/null
+++ b/inflection/test/resources/inflection/dialog/inflection/ml.xml
@@ -0,0 +1,68 @@
+
+
+
+
+
+ മരംമരങ്ങൾ
+ കഥകഥകൾ
+
+
+ അട്ടുകഅട്ടുക
+ അട്ടുകഅട്ടുക
+
+
+ പക്ഷിപക്ഷി
+ പക്ഷിപക്ഷിയെ
+ പക്ഷിപക്ഷിക്കു്
+ പക്ഷിപക്ഷിയുടെ
+ പക്ഷിപക്ഷിയിൽ
+ പക്ഷിപക്ഷിയാൽ
+
+
+ മരം
+ മരങ്ങൾ
+
+
+ വളപ്പുറത്തെ ലൈറ്റ്
+ വളപ്പുറത്തെ ലൈറ്റുകൾ
+ തോട്ടത്തിലെ ലൈറ്റുകൾ
+
+
+ അട്ടുകഅട്ടുക
+ അട്ടുകഅട്ടുക
+ അട്ടുകഅട്ടുക
+
+
+ പോകുകപോകുക
+ പോകുകപോകുക
+ അട്ടുകഅട്ടുക
+ അട്ടുകഅട്ടുക
+ ആർക്കുകആർക്കുക
+
ആർക്കുകആർക്കുക + + + മീറ്റർമീറ്റർ + മീറ്റർമീറ്ററുകൾ + + + കപ്പ്കപ്പുകൾ + പൂച്ചപൂച്ചകൾ + + + ക്യാമ്പസ് ലൈറ്റ് + തോട്ടത്തിലെ ലൈറ്റുകൾ + + + അട്ടുകഅട്ടുക + + + ഇടുകഇടുക + അട്ടുകഅട്ടുക + + + അട്ടുകഅട്ടുക + പോകുകപോകുക + + diff --git a/inflection/test/resources/inflection/dialog/inflection/sr.xml b/inflection/test/resources/inflection/dialog/inflection/sr.xml index 562c7584..037c275c 100644 --- a/inflection/test/resources/inflection/dialog/inflection/sr.xml +++ b/inflection/test/resources/inflection/dialog/inflection/sr.xml @@ -18,4 +18,25 @@ + + ИталијаИталијом + авенијаавенијом + авенијаавенијама + кадијакадија + уметницауметнице + птицаптица + СтанаСтано + ЗораЗоро + БожаБожо + ЉубаЉубо + пратњапратњи + радњарадњи + лопталопти + молбамолби + конзерваконзерви + гошћагошћа + двојкадвојака + биткабитака + + diff --git a/inflection/test/resources/inflection/dialog/pronoun/ml.xml b/inflection/test/resources/inflection/dialog/pronoun/ml.xml new file mode 100644 index 00000000..ea41396f --- /dev/null +++ b/inflection/test/resources/inflection/dialog/pronoun/ml.xml @@ -0,0 +1,65 @@ + + + + + അവൻ + + + ഞാൻ + എനിക്ക് + എന്റെ + + + നാം + ഞങ്ങൾ + നമുക്ക് + ഞങ്ങൾക്ക് + നമ്മുടെ + ഞങ്ങളുടെ + + + നീ + താങ്കൾ + നിനെ + താങ്കളെ + നിന്റെ + താങ്കളുടെ + + + നിങ്ങൾ + നിങ്ങളെ + നിങ്ങൾക്ക് + നിങ്ങളുടെ + + + അവൻ + അവനെ + അവന്റെ + + + അവൾ + അവളെ + അവളുടെ + + + അത് + അതിനെ + അതിന്റെ + + + അവർ + അവരെ + അവരുടെ + + + ഞാൻഅവൻ + ഞാൻഅവൾ + ഞാൻതാങ്കൾ + + + ഞങ്ങൾ + നാം + + \ No newline at end of file diff --git a/inflection/test/resources/inflection/tokenizer/ml.xml b/inflection/test/resources/inflection/tokenizer/ml.xml new file mode 100644 index 00000000..12015dfb --- /dev/null +++ b/inflection/test/resources/inflection/tokenizer/ml.xml @@ -0,0 +1,39 @@ + + + + + + കേരളം + + + + കേരളസര്‍ക്കാര്‍ + + + + കേരളം|സര്‍ക്കാര്‍ + + + + പുസ്തകം|ഉണ്ട് + + + + വീട്|ക്ക് + + + + ശ്രീ|നാരായണ|ഗുരു + + + + കേരളബ്ലാസ്റ്റേഴ്സ് + + + + സംഗീതോത്സവം + + + diff --git a/inflection/test/src/inflection/util/LocaleUtilsTest.cpp 
b/inflection/test/src/inflection/util/LocaleUtilsTest.cpp index 884d558c..723843d3 100644 --- a/inflection/test/src/inflection/util/LocaleUtilsTest.cpp +++ b/inflection/test/src/inflection/util/LocaleUtilsTest.cpp @@ -96,6 +96,7 @@ TEST_CASE("LocaleUtilsTest#testCoverage") inflection::util::LocaleUtils::KOREAN(), inflection::util::LocaleUtils::LITHUANIAN(), inflection::util::LocaleUtils::MALAY(), + inflection::util::LocaleUtils::MALAYALAM(), inflection::util::LocaleUtils::NORWEGIAN(), inflection::util::LocaleUtils::DUTCH(), inflection::util::LocaleUtils::POLISH(), @@ -142,6 +143,7 @@ TEST_CASE("LocaleUtilsTest#testCoverage") inflection::util::LocaleUtils::FRANCE(), inflection::util::LocaleUtils::SWITZERLAND_FRENCH(), inflection::util::LocaleUtils::INDIA_HINDI(), + inflection::util::LocaleUtils::INDIA_MALAYALAM(), inflection::util::LocaleUtils::CROATIA(), inflection::util::LocaleUtils::ISRAEL(), inflection::util::LocaleUtils::HUNGARY(),