local headword_page_module = "Module:headword/page"
local list_to_set = require("Module:table").listToSet
local data = {}
------ 1. Lists which are converted into sets. ------
-- Zero-plurals: terms whose plural is written the same as the singular.
-- The callback handed to list_to_set returns its argument unchanged, so
-- presumably each term is stored as its own value (term -> term) rather than
-- a plain `true` -- confirm against [[Module:table]].
local irregular_plurals = list_to_set(
	{
		"cmavo",
		"cmene",
		"fu'ivla",
		"gismu",
		"Han tu",
		"hanja",
		"hanzi",
		"jyutping",
		"kana",
		"kanji",
		"lujvo",
		"phrasebook",
		"pinyin",
		"rafsi",
		"romaji",
	},
	function(term)
		return term
	end
)
-- Plurals that differ from the singular, plus any regular plural whose
-- singular ends in "s" (the module assumes that inputs ending in "s" are
-- already plural).
for singular, plural in pairs({
	mora = "morae",
}) do
	irregular_plurals[singular] = plural
	-- Map the plural to itself as well, so that a lookup works whichever form
	-- is supplied.
	irregular_plurals[plural] = plural
end
-- Both fields point at the same table; `invariable` is the legacy name and is
-- marked for removal.
data.invariable = irregular_plurals
data.irregular_plurals = irregular_plurals
-- Part-of-speech names that are converted into the lookup set `data.lemmas`.
-- Some names are listed in both a capitalised and a lower-case spelling.
-- The redundant second "Kata dasar" entry has been dropped; the resulting set
-- is unchanged.
-- NOTE(review): other names come in capitalised/lower-case pairs, so the
-- duplicate may have been intended as "kata dasar" -- confirm before adding.
data.lemmas = list_to_set({
	"Kependekan",
	"Akronim",
	"Kata sifat",
	"kata sifat",
	"adnominals",
	"adpositions",
	"Kata adverba",
	"Imbuhan",
	"ambipositions",
	"Kata sandang",
	"Apitan",
	"circumpositions",
	"Penjodoh bilangan",
	"cmavo",
	"cmavo clusters",
	"cmene",
	"Bentuk gabungan",
	"Kata penghubung",
	"counters",
	"Penunjuk",
	"Tanda diakritik",
	"Digraf",
	"equative adjectives",
	"fu'ivla",
	"gismu",
	"Aksara Han",
	"Han tu",
	"hanja",
	"hanzi",
	"ideophones",
	"Simpulan bahasa",
	"Sisipan",
	"initialisms",
	"Tanda lelaran",
	"interfixes",
	"Kata seru",
	"kata seru",
	"kana",
	"kanji",
	"Huruf",
	"ligatur",
	"Logogram",
	"lujvo",
	"morae",
	"Morfem",
	"non-constituents",
	"Kata nama",
	"kata nama",
	"Nombor",
	"Simbol angka",
	"Kata bilangan",
	"Partikel",
	"Frasa",
	"frasa",
	"postpositions",
	"postpositional phrases",
	"predicatives",
	"Awalan",
	"Frasa sendi nama",
	"Kata sendi nama",
	"preverbs",
	"pronominal adverbs",
	"Kata ganti nama",
	"kata ganti nama",
	"Kata nama khas",
	"kata nama khas",
	"Peribahasa",
	"Tanda baca",
	"relatives",
	"Akar",
	"Kata dasar",
	"Akhiran",
	"Suku kata",
	"Simbol",
	"Kata kerja",
	"kata kerja",
})
-- Part-of-speech names that are converted into the lookup set `data.nonlemmas`.
-- NOTE(review): "Bentuk gabungan" and "equative adjectives" are also present in
-- `data.lemmas` in this file -- confirm which set each name should belong to.
data.nonlemmas = list_to_set({
	"active participle forms",
	"active participles",
	"adjectival participles",
	"adjective case forms",
	"Bentuk kata sifat",
	"Bentuk feminin kata sifat",
	"Bentuk jamak kata sifat",
	"Bentuk adverba",
	"adverbial participles",
	"agent participles",
	"Bentuk artikel",
	"Bentuk apitan",
	"Bentuk gabungan",
	"comparative adjective forms",
	"comparative adjectives",
	"comparative adverb forms",
	"comparative adverbs",
	"conjunction forms",
	"contractions",
	"converbs",
	"determiner comparative forms",
	"determiner forms",
	"determiner superlative forms",
	"diminutive nouns",
	"elative adjectives",
	"equative adjective forms",
	"equative adjectives",
	"future participles",
	"gerund",
	"infinitive forms",
	"infinitives",
	"interjection forms",
	"jyutping",
	"Kesalahan ejaan",
	"negative participles",
	"nominal participles",
	"noun case forms",
	"noun dual forms",
	"Bentuk kata nama",
	"noun paucal forms",
	"Bentuk jamak kata nama",
	"noun possessive forms",
	"noun singulative forms",
	"numeral forms",
	"participles",
	"participle forms",
	"particle forms",
	"passive participles",
	"past active participles",
	"past participles",
	"past participle forms",
	"past passive participles",
	"perfect active participles",
	"perfect participles",
	"perfect passive participles",
	"Pinyin",
	"Jamak",
	"postposition forms",
	"Bentuk awalan",
	"preposition contractions",
	"preposition forms",
	"prepositional pronouns",
	"present active participles",
	"present participles",
	"present passive participles",
	"Bentuk kata ganti nama",
	"pronoun possessive forms",
	"Bentuk kata nama khas",
	"Bentuk jamak kata nama khas",
	"rafsi",
	"Perumian",
	"root forms",
	"singulatives",
	"Bentuk akhiran",
	"superlative adjective forms",
	"Kata sifat superlatif",
	"superlative adverb forms",
	"superlative adverbs",
	"bentuk kata kerja",
	"verbal nouns",
})
-- Languages whose headwords do not get links to the separate parts of the
-- headword.
data.no_multiword_links = list_to_set({
	"zh",
})
-- Languages for which no "LANG multiword terms" category is added.
data.no_multiword_cat = list_to_set({
	-------- Languages without spaces between words (sometimes spaces between phrases) --------
	"blt", -- Tai Dam
	"ja", -- Japanese
	"khb", -- Lü
	"km", -- Khmer
	"lo", -- Lao
	"mnw", -- Mon
	"my", -- Burmese
	"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
	"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables)
	"nod", -- Northern Thai
	"ojp", -- Old Japanese
	"shn", -- Shan
	"sou", -- Southern Thai
	"tdd", -- Tai Nüa
	"th", -- Thai
	"tts", -- Isan
	"twh", -- Tai Dón
	"txg", -- Tangut
	"zh", -- Chinese (all varieties with Chinese characters)
	"zkt", -- Khitan

	-------- Languages with spaces between syllables --------
	"ahk", -- Akha
	"aou", -- A'ou
	"atb", -- Zaiwa
	"byk", -- Biao
	"cdy", -- Chadong
	--"duu", -- Drung; not sure
	--"hmx-pro", -- Proto-Hmong-Mien
	--"hnj", -- Green Hmong; not sure
	"huq", -- Tsat
	"ium", -- Iu Mien
	--"lis", -- Lisu; not sure
	"mtq", -- Muong
	--"mww", -- White Hmong; not sure
	"onb", -- Lingao
	--"sit-gkh", -- Gokhy; not sure
	--"swi", -- Sui; not sure
	"tbq-lol-pro", -- Proto-Loloish
	"tdh", -- Thulung
	"ukk", -- Muak Sa-aak
	"vi", -- Vietnamese
	"yig", -- Wusa Nasu
	"zng", -- Mang

	-------- Languages with ~ with surrounding spaces used to separate variants --------
	"mkh-ban-pro", -- Proto-Bahnaric
	"sit-pro", -- Proto-Sino-Tibetan

	-------- Other weirdnesses --------
	"mul", -- Translingual; gestures, Morse code, etc.
	"aot", -- Atong (India); bullet is a letter

	-------- All sign languages --------
	"ads", "aed", "aen", "afg", "ase", "asf", "asp", "asq", "asw", "bfi",
	"bfk", "bog", "bqn", "bqy", "bvl", "bzs", "cds", "csc", "csd", "cse",
	"csf", "csg", "csl", "csn", "csq", "csr", "doq", "dse", "dsl", "ecs",
	"esl", "esn", "eso", "eth", "fcs", "fse", "fsl", "fss", "gds", "gse",
	"gsg", "gsm", "gss", "gus", "hab", "haf", "hds", "hks", "hos", "hps",
	"hsh", "hsl", "icl", "iks", "ils", "inl", "ins", "ise", "isg", "isr",
	"jcs", "jhs", "jls", "jos", "jsl", "jus", "kgi", "kvk", "lbs", "lls",
	"lsl", "lso", "lsp", "lst", "lsy", "lws", "mdl", "mfs", "mre", "msd",
	"msr", "mzc", "mzg", "mzy", "nbs", "ncs", "nsi", "nsl", "nsp", "nsr",
	"nzs", "okl", "pgz", "pks", "prl", "prz", "psc", "psd", "psg", "psl",
	"pso", "psp", "psr", "pys", "rms", "rsl", "rsm", "sdl", "sfb", "sfs",
	"sgg", "sgx", "slf", "sls", "sqk", "sqs", "ssp", "ssr", "svk", "swl",
	"syy", "tse", "tsm", "tsq", "tss", "tsy", "tza", "ugn", "ugy", "ukl",
	"uks", "vgt", "vsi", "vsl", "vsv", "xki", "xml", "xms", "ygs", "ysl",
	"zib", "zsl",
})
-- Languages in which a hyphen does not count as a word separator when deciding
-- on the "multiword terms" category.
data.hyphen_not_multiword_sep = list_to_set({
	"akk", -- Akkadian; hyphens between syllables
	"akl", -- Aklanon; hyphens for mid-word glottal stops
	"ber-pro", -- Proto-Berber; morphemes separated by hyphens
	"ceb", -- Cebuano; hyphens for mid-word glottal stops
	"cnk", -- Khumi Chin; hyphens used in single words
	"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
	"de", -- German; too many false positives
	"esx-esk-pro", -- hyphen used to separate morphemes
	"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
	"hil", -- Hiligaynon; hyphens for mid-word glottal stops
	"ilo", -- Ilocano; hyphens for mid-word glottal stops
	"kne", -- Kankanaey; hyphens for mid-word glottal stops
	"lcp", -- Western Lawa; dash as syllable joiner
	"lwl", -- Eastern Lawa; dash as syllable joiner
	"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
	"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
	"msb", -- Masbatenyo; too many false positives
	"tl", -- Tagalog; too many false positives
	"war", -- Waray-Waray; too many false positives
	"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
})
-- Languages for which "LANG masculine nouns" and similar categories are not
-- added. They have no grammatical gender but use the gender field for other
-- purposes.
data.no_gender_cat = list_to_set({
	"ja",
	"th",
})
-- Language codes converted into the lookup set `data.notranslit`.
-- NOTE(review): presumably these are languages for which the headword should
-- not carry a transliteration -- confirm against the modules that read this.
data.notranslit = list_to_set({
	"ams", "az", "bbc", "bug", "cia", "cjm", "cmn", "cpi",
	"hak", "ja", "kzg", "lad", "lzh", "ms", "mul", "mvi",
	"nan", "nan-hbl", "nan-hnm", "nan-luh", "nan-tws",
	"oj", "okn", "ryn", "rys", "ryu", "sh", "tgt",
	"th", "tkn", "tly", "txg", "und", "vi", "xug", "yoi",
	"yox", "yue", "za", "zh",
})
-- Script codes that receive a script-tagged display title. Codes are kept in
-- their original order, with language-specific variants next to their base
-- script.
data.toBeTagged = list_to_set({
	"Ahom",
	"Arab", "fa-Arab", "glk-Arab", "kk-Arab", "ks-Arab", "ku-Arab", "mzn-Arab",
	"ms-Arab", "ota-Arab", "pa-Arab", "ps-Arab", "sd-Arab", "tt-Arab", "ug-Arab",
	"ur-Arab",
	"Armi", "Armn", "Avst",
	"Bali", "Bamu", "Batk", "Beng", "as-Beng", "Bopo", "Brah", "Brai", "Bugi", "Buhd",
	"Cakm", "Cans", "Cari", "Cham", "Cher", "Copt", "Cprt", "Cyrl", "Cyrs",
	"Deva", "Dsrt",
	"Egyd", "Egyp", "Ethi",
	"Geok", "Geor", "Glag", "Goth", "Grek", "Polyt", "polytonic", "Gujr", "Guru",
	"Hang", "Hani", "Hano", "Hebr", "Hira", "Hluw",
	"Ital",
	"Java",
	"Kali", "Kana", "Khar", "Khmr", "Knda", "Kthi",
	"Lana", "Laoo", "Latn", "Latf", "Latg", "Latnx", "Latinx", "pjt-Latn",
	"Lepc", "Limb", "Linb", "Lisu", "Lyci", "Lydi",
	"Mand", "Mani", "Marc", "Merc", "Mero", "Mlym",
	"Mong", "mnc-Mong", "sjo-Mong", "xwo-Mong",
	"Mtei", "Mymr",
	"Narb", "Nkoo", "Nshu",
	"Ogam", "Olck", "Orkh", "Orya", "Osma", "Ougr",
	"Palm", "Phag", "Phli", "Phlv", "Phnx", "Plrd", "Prti",
	"Rjng", "Runr",
	"Samr", "Sarb", "Saur", "Sgnw", "Shaw", "Shrd", "Sinh", "Sora", "Sund", "Sylo", "Syrc",
	"Tagb", "Tale", "Talu", "Taml", "Tang", "Tavt", "Telu", "Tfng", "Tglg", "Thaa", "Thai", "Tibt",
	"Ugar",
	"Vaii",
	"Xpeo", "Xsux",
	"Yiii",
	"Zmth", "Zsym",
	"Ipach", "Music", "Rumin",
})
-- Parts of speech that are excluded from categories such as "English terms
-- spelled with É" when the term is itself the character in question (e.g. the
-- letter entry for English [[é]]). By contrast, an entry like the French
-- adjective [[m̂]] is a one-letter word that is spelled with the letter.
data.pos_not_spelled_with_self = list_to_set({
	"Tanda diakritik",
	"Aksara Han",
	"Han tu",
	"hanja",
	"hanzi",
	"Tanda lelaran",
	"kana",
	"kanji",
	"Huruf",
	"ligatur",
	"Logogram",
	"morae",
	"Simbol angka",
	"Kata bilangan",
	"Tanda baca",
	"Suku kata",
	"Simbol",
})
------ 2. Lists not converted into sets. ------
-- Short aliases accepted for parts of speech (param 2=). Each key is the short
-- form; each value is the canonical singular (non-pluralised) name. Singular
-- values let the same table serve [[Module:form of]] (p=/POS= param) and
-- [[Module:links]] (pos= param). Entries are grouped by the name they map to.
-- NOTE(review): some values ("kata hubung", "preposisi", "angka") do not match
-- any name in the lemma list in this file, which has "Kata penghubung",
-- "Kata sendi nama" and "Kata bilangan"/"Nombor" -- confirm which spelling
-- consumers expect.
data.pos_aliases = {
	-- Adjectives and adverbs
	a = "kata sifat",
	adj = "kata sifat",
	adv = "kata adverba",

	-- Articles and determiners
	art = "kata sandang",
	det = "penunjuk",

	-- Numbers
	cnum = "nombor kardinal",
	onum = "nombor ordinal",
	num = "angka",

	-- Conjunctions, converbs and interjections
	conj = "kata hubung",
	conv = "converb",
	int = "kata seru",
	interj = "kata seru",
	intj = "kata seru",

	-- Nouns and proper nouns
	n = "kata nama",
	pn = "kata nama khas",
	prop = "kata nama khas",
	proper = "kata nama khas",

	-- Participles, particles and phrases
	part = "participle",
	pcl = "partikel",
	phr = "frasa",

	-- Adpositions
	postp = "postposition",
	pre = "preposisi",
	prep = "preposisi",

	-- Pronouns
	pro = "kata ganti nama",
	pron = "kata ganti nama",

	-- Verbs
	v = "kata kerja",
	vb = "kata kerja",
	vi = "kata kerja tak transitif",
	vt = "kata kerja transitif",
	vti = "kata kerja transitif dan tak transitif",
}
-- Parts of speech for which gender/number categories (such as "German
-- masculine nouns" or "Russian imperfective verbs") are generated when the
-- headword has the appropriate gender/number. Each key maps to the name stored
-- as its value; "Kata nama khas" maps to "Kata nama".
data.pos_for_gender_number_cat = {
	["Kata nama"] = "Kata nama",
	["Kata nama khas"] = "Kata nama",
	["Akhiran"] = "Akhiran",
	-- Verbs are included because impf and pf are valid "genders".
	["Kata kerja"] = "Kata kerja",
}
------ 3. Page-wide processing (so that it only needs to be done once per page). ------
do
	-- Run the page-level processing from the headword page module and keep the
	-- result on the data table.
	local page = require(headword_page_module).process_page()
	data.page = page
	-- FIXME: references to data.pagename and data.encoded_pagename are scattered
	-- throughout the codebase, so these copies of the page fields are still
	-- exposed directly on `data`.
	data.pagename = page.pagename
	data.encoded_pagename = page.encoded_pagename
end
return data