-- Module:form of/lang-data/en/functions
-- From Wiktionary, the free dictionary


--[=[
This module contains lang-specific functions for English.
]=]

local require_when_needed = require("Module:require when needed")

local en_utilities_module = "Module:en-utilities"
local strutil_module = "Module:string utilities"
local table_module = "Module:table"

local is_regular_plural = require_when_needed(en_utilities_module, "is_regular_plural")
local pattern_escape = require_when_needed(strutil_module, "pattern_escape")
local split = require_when_needed(strutil_module, "split")
local umatch = require_when_needed(strutil_module, "match")

----------------------- Category functions -----------------------

-- List of umlaut plurals. Each entry is of the form {SINGULAR, PLURAL}. A lemma/plural pair is counted when the lemma
-- ends in SINGULAR and the plural consists of the same preceding stem followed by PLURAL (hence [[dormouse]] plural
-- [[dormice]] is counted, with stem "dorm"). The entries are Lua patterns and are interpolated unescaped into the
-- match patterns, so pattern magic characters (such as the `?` below) are live. Entries are tried in order and the
-- first one that matches wins.
local umlaut_plurals = {
	-- [[mouse]] -> [[mice]], [[louse]] -> [[lice]], jocular [[house]] -> [[hice]], [[spouse]] -> [[spice]]
	{"ouse", "ice"},
	-- [[goose]] -> [[geese]], [[swoose]] -> [[sweese]], jocular [[moose]] -> [[meese]]
	{"oose", "eese"},
	{"fan", "fen"},
	{"fanne", "fenne"},
	{"pan", "pen"},
	{"foot", "feet"},
	{"foote", "feete"},
	{"booth", "beeth"},
	{"tooth", "teeth"},
	{"toof", "teef"},
	{"brother", "brethren"},
	{"Brother", "Brethren"}, -- until capitalization is fixed
	-- The trailing `e?` makes the final "e" optional, so plurals ending in either "ky" or "kye" are accepted.
	{"cow", "kye?"},
	{"cow", "kine"},
}

--[=[
The key `cat` must be specified and is the name of the category following the language name. Suffixes enclosed in
double angle brackets, e.g. <<-ata>>, are italicized (as if written e.g. {{m|en||-ata}}) in the displayed title, but
not in the category name itself. The description of the category comes from the `description` field; if omitted, it is
constructed from the category by adding "English irregular" to the beginning and appending the value of `desc_suffix`
(if given) to the end. Suffixes enclosed in double angle brackets are italicized, as described above, and template
calls are permitted.

The key `matches_plural` must be specified and is either a string or a function. If a string, the string is a Lua
pattern that should match the end of the pagename, and the remainder becomes the stem passed to `matches_lemma` (see
below). If a function, it should accept two arguments, the pagename and the lemma (or more precisely, the words in the
pagename and lemma that differ, if there are multiple words), and should return the stem of the pagename (minus the
ending) if the pagename matches the ending, otherwise nil.

The key `matches_lemma` must be specified and is either a string or a function. If a string, the string is a Lua
pattern that should match the lemma. If a function, it should accept three arguments, the pagename and lemma as in
`matches_plural`, and the stem returned by `matches_plural` or extracted from the pagename and ending. It should return
a boolean indicating whether the lemma matches.

The key `additional`, if given, is additional text to include in the category description as displayed on the page
itself, but not in the summary of the category as displayed on other pages. For further information, see the
`additional` field in [[Module:category tree/poscatboiler/data/documentation]].

The key `breadcrumb`, if given, is the breadcrumb text. See [[Module:category tree/poscatboiler/data/documentation]].
If omitted, the breadcrumb is constructed from the category name by removing "plurals in" from the beginning of the
category name.

The key `sort_key`, if given, specifies the sort key for the category in its parent category
[[:Category:English irregular plurals]]. By default it is derived from the breadcrumb by removing an initial hyphen.

If a plural doesn't match any of the entries, it goes into [[:Category:English miscellaneous irregular plurals]]. Note
that before checking these entries, plurals that are the same as the singular are excluded (i.e. not considered
irregular), as are plurals formed from the singular by adding [[-s]], [[-es]], [[-'s]] or [[-ses]] (if the singular ends
in '-s'; cf. [[bus]] -> 'busses', [[dis]] -> 'disses'), or by replacing final [[-y]] with [[-ies]].
]=]
-- Ordered list of irregular plural classes. The list is walked from top to bottom and the first entry whose
-- `matches_plural` and `matches_lemma` both succeed determines the category, so entries with more specific endings
-- must precede entries with more general ones (e.g. <<-ata>> before <<-a>>, <<-oi>> before <<-i>>). String values of
-- `matches_plural` are appended to "^(.*)" when matching, so each of them must carry its own trailing `$` anchor.
local irregular_plurals = {
	{
		cat = "plurals with umlaut",
		description = "{{{langname}}} irregular noun plurals that are formed via [[umlaut]], i.e. by changing the root vowel rather than adding a suffix.",
		additional = "See also [[:Category:English plurals in -men with singular in -man|Category:English plurals in ''-men'' with singular in ''-man'']].",
		sort_key = "umlaut",
		matches_plural = function(pagename, lemma)
			for _, umlaut_plural in ipairs(umlaut_plurals) do
				local singular, plural = unpack(umlaut_plural)
				local stem = umatch(lemma, "^(.*)" .. singular .. "$")
				if stem and umatch(pagename, "^" .. pattern_escape(stem) .. plural .. "$") then
					return stem
				end
			end
			return nil
		end,
		matches_lemma = function(pagename, lemma, stem)
			-- All the work already done in matches_plural().
			return true
		end,
	},
	{
		cat = "plurals in <<-ata>> with singular in <<-a>>",
		desc_suffix = ", mostly originating from Ancient Greek neuter nouns in {{m|grc|-μᾰ}}",
		matches_plural = "ata$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "a"
		end,
	},
	{
		cat = "plurals in <<-ina>> with singular in <<-en>>",
		desc_suffix = ", mostly originating from Latin neuter nouns",
		additional = "Plurals formed by replacing a final <<-inum>> or <<-inon>> with a final <<-ina>> are found in [[:Category:English plurals in -a with singular in -um or -on|Category:English plurals in ''-a'' with singular in ''-um'' or ''-on'']].",
		matches_plural = "ina$",
		matches_lemma = "en$",
	},
	{
		cat = "plurals in <<-ra>> with singular in <<-s>>",
		desc_suffix = ", mostly originating from Latin neuter nouns",
		additional = "Sometimes the preceding vowel changes; e.g. <<-us>> commonly changes to <<-era>> or <<-ora>> in the plural. Plurals formed by replacing a final <<-rum>> or <<-ron>> with a final <<-ra>> are found in [[:Category:English plurals in -a with singular in -um or -on|Category:English plurals in ''-a'' with singular in ''-um'' or ''-on'']].",
		matches_plural = "ra$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma:find("s$")
		end,
	},
	{
		cat = "plurals in <<-a>> with singular in <<-um>> or <<-on>>",
		desc_suffix = ", mostly originating from Latin or Greek neuter nouns",
		additional = "Plurals formed by replacing a final <<-a>> with a final <<-ata>> are found in [[:Category:English plurals in -ata with singular in -a|Category:English plurals in ''-ata'' with singular in ''-a'']].",
		matches_plural = "a$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma:find("um$") or lemma:find("on$")
		end,
	},
	{
		cat = "plurals in <<-ae>> with singular in <<-a>>",
		desc_suffix = ", mostly originating from Latin feminine nouns",
		additional = "The <<-ae>> can also be written as a ligature <<-æ>>.",
		matches_plural = function(pagename, lemma)
			return pagename:match("^(.*)ae$") or pagename:match("^(.*)æ$")
		end,
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "a"
		end,
	},
	{
		cat = "plurals in <<-e>> with singular in <<-a>>",
		desc_suffix = ", mostly originating from Italian feminine nouns",
		additional = "This category does not contain English invariant plurals ending in an <<-e>>, such as {{m|en|moose}} or {{m|en|Japanese}}, for which no letters are changed.",
		matches_plural = "e$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "a"
		end,
	},
	{
		cat = "plurals in <<-e>>",
		desc_suffix = ", mostly originating from German masculine or neuter nouns",
		additional = "These are formed by adding <<-e>>. See also [[:Category:English plurals in -e with singular in -a|Category:English plurals in ''-e'' with singular in ''-a'']].",
		matches_plural = "e$",
		matches_lemma = function(pagename, lemma, stem)
			if pagename == lemma .. "e" then
				return true
			end
		end,
	},
	{
		cat = "plurals in <<-oi>> with singular in <<-os>>",
		desc_suffix = ", mostly originating from Greek masculine nouns",
		matches_plural = "oi$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "os"
		end,
	},
	{
		cat = "plurals in <<-i>> with singular in <<-us>>, <<-os>> or <<-o>>",
		desc_suffix = ", mostly originating from Latin or Italian masculine nouns",
		additional = "Note that not all of these plurals are considered correct by all speakers.",
		matches_plural = "i$",
		matches_lemma = function(pagename, lemma, stem)
			-- don't check just for stem matching because of cases like virus -> virii that we want included.
			return lemma:find("us$") or lemma:find("os$") or lemma:find("o$")
		end,
	},
	{
		cat = "plurals in <<-i>> with singular in <<-e>>",
		desc_suffix = ", mostly originating from Italian masculine nouns",
		matches_plural = "i$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "e"
		end,
	},
	{
		cat = "plurals in <<-ia>> with singular in <<-e>>",
		desc_suffix = ", mostly originating from Latin neuter nouns",
		matches_plural = "ia$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "e"
		end,
	},
	{
		cat = "plurals in <<-ai>> with singular in <<-a>> or <<-e>>",
		desc_suffix = ", mostly originating from Greek feminine nouns",
		matches_plural = "ai$",
		matches_lemma = "[ae]$",
	},
	{
		cat = "plurals in <<-ae>> with singular in <<-e>>",
		desc_suffix = ", mostly originating from Greek feminine nouns",
		matches_plural = "ae$",
		matches_lemma = "e$",
	},
	{ -- siphon off most of the "umlaut" plurals that will otherwise end up in 'English miscellaneous irregular plurals'
		cat = "plurals in <<-men>> with singular in <<-man>>",
		desc_suffix = " (and likewise plurals in <<-women>> with singular in <<-woman>>)",
		matches_plural = "[Mm]en$", -- until capitalization is fixed
		matches_lemma = "[Mm]an$", -- until capitalization is fixed
	},
	{
		cat = "plurals in <<-children>> with singular in <<-child>>",
		-- The trailing `$` is required: without it the pattern would match "children" anywhere in the pagename rather
		-- than only at its end.
		matches_plural = "[Cc]hildren$", -- until capitalization is fixed
		matches_lemma = "[Cc]hild$", -- until capitalization is fixed
	},
	{
		cat = "plurals in <<-en>>",
		additional = "Plurals formed by replacing a final <<-man>> with <<-men>> are found in [[:Category:English plurals in -men with singular in -man|Category:English plurals in ''-men'' with singular in ''-man'']], and those formed by replacing a final <<-child>> with <<-children>> are found in [[:Category:English plurals in -children with singular in -child|Category:English plurals in ''-children'' with singular in ''-child'']].",
		matches_plural = "en$",
		matches_lemma = function(pagename, lemma, stem)
			return not lemma:find("en$")
		end,
	},
	{
		cat = "plurals in <<-x>>",
		desc_suffix = ", mostly originating from French masculine nouns",
		additional = "Generally these are formed by adding <<-x>> to a noun ending in <<-u>>; changing final <<-al>> or <<-ail>> to <<-aux>>; or changing final <<-el>> to <<-eaux>>.",
		matches_plural = "x$",
		matches_lemma = "[lu]$",
	},
	{
		cat = "plurals in <<-im>>",
		desc_suffix = ", mostly originating from Hebrew masculine nouns",
		additional = "Generally these are formed by simply adding <<-im>>, or <<-m>> if the singular ends in <<-i>> ({{m|en|illui}} – {{m|en|illuim}}; but cf. {{m|en|goiim}}). Some changes that may occur are <<-e->> to <<-a->> or vice versa ({{m|en|heder}} – {{m|en|hadarim}}; {{m|en|gaon}} – {{m|en|geonim}}), <<-f>> to <<-v->> ({{m|en|ganef}} – {{m|en|ganevim}}), and <<-s>> to <<-t->> ({{m|en|balabos}} – {{m|en|balabatim}}).",
		matches_plural = "im$",
		matches_lemma = function(pagename, lemma, stem)
			return true
		end,
	},
	{
		cat = "plurals in <<-ot>>",
		desc_suffix = ", mostly originating from Hebrew feminine nouns",
		matches_plural = "ot$",
		matches_lemma = function(pagename, lemma, stem)
			return true
		end,
	},
	{
		cat = "plurals in <<-oth>>",
		desc_suffix = ", mostly originating from Hebrew feminine nouns",
		matches_plural = "oth$",
		matches_lemma = function(pagename, lemma, stem)
			return true
		end,
	},
	{
		cat = "plurals in <<-y>>",
		desc_suffix = ", mostly originating from Polish feminine nouns, Russian or Ukrainian masculine/feminine nouns, or Czech masculine nouns",
		additional = "These are formed by adding <<y>>, or changing <<-a>> to <<-y>>. The last vowel may be removed ({{m|en|khokhol}} – {{m|en|khokhly}}; {{m|en|strelets}} – {{m|en|streltsy}}).",
		matches_plural = "y$",
		matches_lemma = function(pagename, lemma, stem)
			-- We don't want [[ky]] (plural of [[cow]]) and [[passersby]] (plural of [[passerby]]) to be included, so we check
			-- for the expected situations:
			-- (1) Simple addition of -y (e.g. [[kolkhoz]] -> [[kolkhozy]]) or changing -a to -y (e.g. [[tselina]] -> [[tseliny]]).
			if pagename == lemma .. "y" or pagename == lemma:gsub("a$", "") .. "y" then
				return true
			end
			-- (2) Reduction of lemma before -y by deleting the last vowel (e.g. [[strelets]] -> [[streltsy]]).
			local reduced_lemma = lemma:gsub("[aeiou]([^aeiou]+)$", "%1")
			return pagename == reduced_lemma .. "y"
		end,
	},
	{
		cat = "plurals in <<-i>>",
		desc_suffix = ", mostly originating from Russian masculine/feminine nouns or Italian masculine nouns",
		additional = "These are formed by adding <<i>>, or changing <<-a>> to <<-i>>. See also [[:Category:English plurals in -i with singular in -e|Category:English plurals in ''-i'' with singular in ''-e'']] and [[:Category:English plurals in -i with singular in -us, -os or -o|Category:English plurals in ''-i'' with singular in ''-us'', ''-os'' or ''-o'']].",
		matches_plural = "i$",
		matches_lemma = function(pagename, lemma, stem)
			if pagename == lemma .. "i" or pagename == lemma:gsub("a$", "") .. "i" then
				return true
			end
		end,
	},
	{
		cat = "plurals in <<-ces>> with singular in <<-x>>",
		desc_suffix = ", mostly originating from Latin masculine or feminine nouns",
		additional = "Generally these are formed by changing a final <<-x>> into <<-ces>> or a final <<-ex>> into <<-ices>>.",
		matches_plural = "ces$",
		matches_lemma = "x$",
	},
	{
		cat = "plurals in <<-des>> with singular in <<-s>>",
		desc_suffix = ", mostly originating from Latin or Greek masculine or feminine nouns",
		matches_plural = "des$",
		matches_lemma = "s$",
	},
	{
		cat = "plurals in <<-des>> with singular in <<-d>>",
		desc_suffix = ", mostly originating from Latin or Greek masculine or feminine nouns, or Spanish feminine nouns",
		matches_plural = "des$",
		matches_lemma = "d$",
	},
	{
		cat = "plurals in <<-ges>> with singular in <<-x>>",
		desc_suffix = ", mostly originating from Greek masculine or feminine nouns",
		matches_plural = "ges$",
		matches_lemma = "x$",
	},
	{
		cat = "plurals in <<-ves>> with singular in <<-f>> or <<-fe>>",
		desc_suffix = ", mostly originating from native English formations",
		matches_plural = "ves$",
		matches_lemma = "fe?$",
	},
	{
		cat = "plurals in <<-ines>> with singular in <<-o>>",
		desc_suffix = ", mostly originating from Latin masculine or feminine nouns",
		matches_plural = "ines$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "o"
		end,
	},
	{
		cat = "plurals in <<-ones>> with singular in <<-o>>",
		desc_suffix = ", mostly originating from Latin masculine or feminine nouns",
		matches_plural = "ones$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "o"
		end,
	},
	{
		-- cf. [[eikon]] pl. [[eikones]]
		cat = "plurals in <<-ones>> with singular in <<-on>> or <<-ón>>",
		desc_suffix = ", mostly originating from Greek or Spanish masculine nouns",
		matches_plural = "ones$",
		matches_lemma = "[oó]n$",
	},
	{
		-- cf. [[levator]] pl. [[levatores]]
		cat = "plurals in <<-ores>> with singular in <<-or>>",
		desc_suffix = ", mostly originating from Latin masculine nouns",
		matches_plural = "ores$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "or"
		end,
	},
	{
		cat = "plurals in <<-res>> with singular in <<-r>>",
		desc_suffix = ", mostly originating from Greek, Latin, Portuguese, or Spanish masculine nouns",
		additional = "See also [[:Category:English plurals in -ores with singular in -or|Category:English plurals in ''-ores'' with singular in ''-or'']].",
		matches_plural = "res$",
		matches_lemma = "r$",
	},
	{
		cat = "plurals in <<-es>> with singular in <<-is>>",
		desc_suffix = ", mostly originating from Greek feminine nouns, or analogous formations",
		matches_plural = "es$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "is"
		end,
	},
	{
		cat = "plurals in <<-tes>> with singular in <<-s>>",
		desc_suffix = ", mostly originating from Latin or Greek masculine or feminine nouns",
		matches_plural = "tes$",
		matches_lemma = "s$",
	},
	{
		cat = "plurals in <<-tia>> with singular in <<-s>>",
		desc_suffix = ", mostly originating from Latin participles",
		matches_plural = "tia$",
		matches_lemma = "s$",
	},
	{
		cat = "plurals in <<-eis>> with singular in <<-is>>",
		desc_suffix = ", mostly originating from Greek feminine nouns",
		matches_plural = "eis$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "is"
		end,
	},
	{
		cat = "plurals in <<-ies>> with singular in <<-ey>>",
		matches_plural = "ies$",
		matches_lemma = "ey$",
	},
	{
		cat = "plurals in <<-oes>> with singular in <<-o>>",
		matches_plural = "oes$",
		matches_lemma = "o$",
	},
	{
		cat = "plurals in <<-'s>>",
		desc_suffix = ", mostly used where plurals ending in <<-s>> would appear strange or cause confusion",
		matches_plural = "'s$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem
		end,
	},
	{
		cat = "plurals in <<-z>>",
		desc_suffix = ", mostly as slang forms of plurals ending in <<-s>>",
		matches_plural = "z$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem
		end,
	},
	{ -- siphon off most of the "-sful" plurals that will otherwise end up in 'English miscellaneous irregular plurals'
		cat = "plurals in <<-sful>> with singular in <<-ful>>",
		additional = "This includes examples such as {{m|en|teaspoonful}}, plural {{m|en|teaspoonsful}}. Generally "
			.. "these refer to specific measures. Note that not all nouns in <<-ful>> pluralize this way; e.g. the "
			.. "plural of {{m|en|handful}} is normally {{m|en|handfuls}} (but {{m|en|handsful}} is possible, if rare).",
		matches_plural = "sful$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "ful" or
				-- [[boxesful]], [[brushesful]], [[busesful]], [[classesful]], [[dishesful]], [[glassesful]], etc.
				stem:find("e$") and lemma == stem:gsub("e$", "") .. "ful" or
				-- [[bakeriesful]], [[belliesful]], [[galleriesful]], [[librariesful]], [[pantriesful]], etc.
				stem:find("ie$") and lemma == stem:gsub("ie$", "y") .. "ful"
		end,
	},
	{ -- siphon off most of the "-people" plurals that will otherwise end up in 'English miscellaneous irregular plurals'
		cat = "plurals in <<-people>> with singular in <<-person>>",
		matches_plural = "people$",
		matches_lemma = function(pagename, lemma, stem)
			return lemma == stem .. "person"
		end,
	},
}


-- Find the single word that differs between `pagename` and `lemma`, assuming there are the same number of words in
-- both and the spaces and hyphens match. If there is a single word difference, return two values, the pagename word
-- and the lemma word. Otherwise return nil.
local function extract_non_matching_word(pagename, lemma)
	-- Fast path: with no space or hyphen on either side, each string is a single word.
	if not (pagename:find("[ -]") or lemma:find("[ -]")) then
		return pagename, lemma
	end

	-- The separator is captured in the split pattern; the code below relies on the resulting lists holding words at
	-- odd indices and the captured separators at even indices.
	local page_parts = split(pagename, "([ -])")
	local lemma_parts = split(lemma, "([ -])")
	local num_parts = #page_parts

	-- Different numbers of words: no meaningful word-by-word comparison is possible.
	if num_parts ~= #lemma_parts then
		return nil
	end

	local function differs_at(index)
		return page_parts[index] ~= lemma_parts[index]
	end

	-- Every separator (space or hyphen) must be the same on both sides.
	local sep_index = 2
	while sep_index <= num_parts - 1 do
		if differs_at(sep_index) then
			return nil
		end
		sep_index = sep_index + 2
	end

	-- Scan the words left to right for the first mismatch ...
	local first_diff
	for index = 1, num_parts, 2 do
		if differs_at(index) then
			first_diff = index
			break
		end
	end

	-- ... and right to left for the last mismatch.
	local last_diff
	for index = num_parts, 1, -2 do
		if differs_at(index) then
			last_diff = index
			break
		end
	end

	-- The two scans meet on the same word only when exactly one word differs.
	if first_diff ~= last_diff then
		return nil
	end
	return page_parts[first_diff], lemma_parts[first_diff]
end


-- Determine the irregular-plural category (or categories) for a plural form.
--
-- `data.pagename` is the plural being categorized and `data.lemmas` is a list of objects whose `term` field (if
-- present) holds a singular lemma, possibly followed by a `#Fragment` such as `#Noun`.
--
-- Returns nil if `data.pagename` or `data.lemmas` is missing, or if no lemma produces a category (the plural is
-- identical to the lemma, or `is_regular_plural` reports it as regular). Returns a single category name (without the
-- language prefix and with the <<...>> markers stripped) if exactly one distinct category was found, and otherwise a
-- list of the form {"multi", CAT1, CAT2, ...}.
local function irregular_plural_categories(data)
	if not data.pagename or not data.lemmas then
		return nil
	end
	local categories = nil
	-- Accumulate `cat` into `categories`, switching from a bare string to a {"multi", ...} list as soon as a second,
	-- different category shows up; duplicates are ignored.
	local function add_category(cat)
		if categories == nil then
			categories = cat
		elseif categories == cat then
			return
		else
			if type(categories) == "string" then
				categories = {"multi", categories}
			end
			require(table_module).insertIfNot(categories, cat)
		end
	end
	for _, lemma_obj in ipairs(data.lemmas) do
		if lemma_obj.term then
			-- Trim #Noun and similar. The replacement must be written `%1`: Lua's gsub() has no `$1` syntax and would
			-- insert those two characters literally. Assigning to a single local discards gsub()'s second return value.
			-- FIXME: this needs to handle unsupported titles with #
			local lemma = lemma_obj.term:gsub("^(.+)#.*", "%1")
			if lemma == data.pagename then
				-- no category
			else
				local pagename_word, lemma_word = extract_non_matching_word(data.pagename, lemma)
				if pagename_word == nil then
					-- more than one word differs between singular and plural, or different numbers of words in
					-- singular vs. plural, or spaces/hyphens differ
					add_category("miscellaneous irregular plurals")
				-- Do nothing if the plural is regular.
				elseif not is_regular_plural(pagename_word, lemma_word) then
					-- Truthy after the loop only if some entry matched (in which case the loop was left via `break`).
					local matches_lemma
					for _, irreg_plural in ipairs(irregular_plurals) do
						local stem
						if type(irreg_plural.matches_plural) == "string" then
							stem = umatch(pagename_word, "^(.*)" .. irreg_plural.matches_plural)
						else
							stem = irreg_plural.matches_plural(pagename_word, lemma_word)
						end
						if stem then
							if type(irreg_plural.matches_lemma) == "string" then
								matches_lemma = umatch(lemma_word, irreg_plural.matches_lemma)
							else
								matches_lemma = irreg_plural.matches_lemma(pagename_word, lemma_word, stem)
							end
							if matches_lemma then
								local cat = irreg_plural.cat:gsub("<<(.-)>>", "%1") -- discard second retval
								add_category(cat)
								break
							end
						end
					end
					if not matches_lemma then
						add_category("miscellaneous irregular plurals")
					end
				end
			end
		end
	end

	return categories
end

-- Category functions made available by this module, keyed by the name used to request them.
local cat_functions = {}
-- This function is invoked for plurals by an entry in [[Module:form of/cats]].
cat_functions["en-irregular-plural-categories"] = irregular_plural_categories

-- Besides the category functions, the irreg_plurals structure is exported too so that the category handler in
-- [[Module:category tree/poscatboiler/data/lang-specific/en]] can access it.
local exports = {}
exports.cat_functions = cat_functions
exports.irregular_plurals = irregular_plurals
return exports