local m_str_utils = require("Module:string utilities")
local export = {}
local codepoint = m_str_utils.codepoint
local concat = table.concat
local find = string.find
local get_by_code = require("Module:languages").getByCode
local insert = table.insert
local load_data = mw.loadData
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local umatch = mw.ustring.match
local usub = m_str_utils.sub
-- note that arrays loaded by mw.loadData cannot be directly used by gsub
-- Page name used by export.kanji; left unset here and filled in on first use.
local pagename -- generated when needed, to avoid an infinite loop with [[Module:Jpan-sortkey]]
-- Namespace text of the title currently being rendered (compared against "" in export.kanji).
local namespace = mw.title.getCurrentTitle().nsText
-- Shared Japanese data tables, loaded through mw.loadData.
local data = load_data("Module:ja/data")
-- Replacement tables for the long vowel mark, keyed by vowel (see do_long_vowel).
local long_vowels_hira = data.long_vowels_hira
local long_vowels_kata = data.long_vowels_kata
-- Replacement table used by export.normalize_kana for non-combining voice marks.
local voice_marks = data.voice_marks
-- Character ranges, stored as strings that get spliced into "[...]" pattern classes.
local range = load_data("Module:ja/data/range")
local r_hiragana = range.hiragana
-- Iterated with pairs() in do_long_vowel: vowel key -> string of kana.
local r_vowels = range.vowels
local r_kana_combining_characters = range.kana_combining_characters
-- Builds a gsub callback that shifts a matched character by `added_value` codepoints.
-- added_value: signed integer offset added to the character's codepoint.
-- Returns a function taking one character and returning the shifted character.
local function change_codepoint(added_value)
	local function shift(char)
		local shifted = codepoint(char) + added_value
		return u(shifted)
	end
	return shift
end
-- Converts hiragana in `text` to katakana.
-- text: a string, or a frame-like table whose args[1] holds the string.
-- The input is decomposed (NFD) before conversion and recomposed (NFC) on return.
function export.hira_to_kata(text)
	local input = text
	if type(input) == "table" then
		input = input.args[1]
	end
	local decomposed = toNFD(input)
	-- Main hiragana block and iteration marks sit 96 codepoints below their katakana counterparts.
	local converted = ugsub(decomposed, "[ぁ-ゖゝゞ]", change_codepoint(96))
	-- Small kana extension characters are offset by 20 instead.
	converted = ugsub(converted, "[𛅐-𛅒]", change_codepoint(20))
	-- Remaining characters have no fixed offset and are looked up in the data table.
	converted = ugsub(converted, "[𛀁𛀆𛄟𛄲]", data.hira_to_kata)
	return toNFC(converted)
end
-- Converts katakana in `text` to hiragana.
-- text: a string, or a frame-like table whose args[1] holds the string.
-- The input is decomposed (NFD) before conversion and recomposed (NFC) on return.
function export.kata_to_hira(text)
	local input = text
	if type(input) == "table" then
		input = input.args[1]
	end
	local decomposed = toNFD(input)
	-- Main katakana block and iteration marks sit 96 codepoints above their hiragana counterparts.
	local converted = ugsub(decomposed, "[ァ-ヶヽヾ]", change_codepoint(-96))
	-- Small kana extension characters are offset by 20 instead.
	converted = ugsub(converted, "[𛅤-𛅦]", change_codepoint(-20))
	-- Remaining characters have no fixed offset and are looked up in the data table.
	converted = ugsub(converted, "[𛀀𛄠-𛄢𛅕]", data.kata_to_hira)
	return toNFC(converted)
end
-- Strips spaces, apostrophes, hyphens and full stops from the input.
-- Meant for comparing manual romaji, so that spaces or hyphens inserted by an
-- editor do not make the romaji look "wrong".
-- f: a string, or a frame-like table whose args[1] holds the string.
-- Returns the stripped string only (no replacement count).
function export.rm_spaces_hyphens(f)
	local text = f
	if type(f) == "table" then
		text = f.args[1] or f
	end
	local stripped = text:gsub("[ '%-.]+", "")
	stripped = stripped:gsub(" ", "")
	return stripped
end
do
-- Expands a vowel that carried a macron: "o" becomes "ou", anything else is doubled.
local function handle_macron(ch)
	if ch == "o" then
		return "ou"
	end
	return ch .. ch
end
-- Converts romaji to katakana through a fixed sequence of substitutions.
-- f: a string, or a frame-like table whose args[1] holds the string.
-- The order of the steps matters: multi-letter syllables are consumed before
-- the generic consonant+vowel lookup, and bare vowels are handled last.
function export.romaji_to_kata(f)
	local text = f
	if type(f) == "table" then
		text = f.args[1] or f
	end
	text = ulower(toNFD(text))

	local ts_kana = {tsu = "ツ", tso = "ツォ", tsi = "ツィ", tse = "ツェ", tsa = "ツァ"}
	local sh_kana = {shu = "シュ", sho = "ショ", shi = "シ", she = "シェ", sha = "シャ"}
	local ch_kana = {chu = "チュ", cho = "チョ", chi = "チ", che = "チェ", cha = "チャ"}
	local n_kana = {nu = "ヌ", no = "ノ", ni = "ニ", ne = "ネ", na = "ナ"}
	local vowel_kana = {u = "ウ", o = "オ", i = "イ", e = "エ", a = "ア"}

	-- A character followed by a combining macron (bytes \204\132) is expanded.
	text = text:gsub("(.[\128-\191]*)\204\132", handle_macron)
	-- Doubled characters become a sokuon plus the single character.
	text = text:gsub("(.)%1", "ッ%1")
	text = text:gsub("tc", "ッc")
	text = text:gsub("tsyu", "ツュ")
	text = text:gsub("ts[uoiea]", ts_kana)
	text = text:gsub("sh[uoiea]", sh_kana)
	text = text:gsub("ch[uoiea]", ch_kana)
	-- Matches with no table entry (bare "n", "n'") are left for the later "n'?" step.
	text = text:gsub("n[uoiea']?", n_kana)
	text = text:gsub("[wvtrpsnmlkjhgfdbzy][yw]?[uoiea]", data.rk)
	text = text:gsub("n'?", "ン")
	text = text:gsub("[aeiou]", vowel_kana)
	return text
end
end
-- Expects any mix of kanji and kana, and reports which script types are used.
-- e.g. given イギリス人, it returns Kana+Hani
-- f: a string, or a frame-like table whose args[1] holds the string.
-- Returns the detected labels joined with "+", in the fixed order
-- Hira, Kana, Hani, Romaji, Number, Abbreviation.
function export.script(f)
	local text = f
	if type(f) == "table" then
		text = f.args[1] or f
	end
	-- For Hira and Kana, characters shared by both scripts are removed first,
	-- so that ー etc. do not produce false positives.
	local no_overlap = ugsub(text, "[" .. range.kana_overlap .. "]+", "")
	-- Each entry: subject string, pattern to look for, label to emit.
	local checks = {
		{no_overlap, "[" .. r_hiragana .. "ゟ]", "Hira"},
		{no_overlap, "[" .. range.katakana .. "ヿ]", "Kana"},
		{text, "[" .. range.kanji .. "]", "Hani"},
		{text, "[" .. range.latin .. "]", "Romaji"},
		{text, "[" .. range.numbers .. "]", "Number"},
		{text, "[〆々]", "Abbreviation"},
	}
	local found = {}
	for _, check in ipairs(checks) do
		if umatch(check[1], check[2]) then
			insert(found, check[3])
		end
	end
	return concat(found, "+")
end
do
-- Character-class body (spliced into "[...]" / "[^...]") made of range.submoraic_kana followed by the kana combining characters.
local submoraic = range.submoraic_kana .. r_kana_combining_characters
-- Character-class body covering whitespace (%s), punctuation (%p) and the listed ASCII symbols.
local spacing_punc = "%s%p%$%+=>%^`|~"
-- Appends a run of spacing/punctuation (or markup) to the mora being built.
-- ch: the consumed text; mora: the mora table, modified in place.
-- mora.sp is set when the run contains anything other than "^", "%" or "'".
-- Returns ch and mora.
local function handle_spacing_punc(ch, mora)
	insert(mora, ch)
	local has_breaking_char = find(ch, "[^%^%%']")
	if has_breaking_char then
		mora.sp = true
	end
	return ch, mora
end
-- Consumes one unit of `text` beginning at character index `start`.
-- text: the full string; start: character index to read from;
-- morae: array receiving finished morae; mora: the mora under construction (or nil).
-- Returns the consumed substring and the (possibly new) mora table.
local function iterate_mora(text, start, morae, mora)
	if not mora then
		mora = {}
	end
	-- A run of spacing/punctuation is attached to the current mora as-is.
	local punc = umatch(text, "^[" .. spacing_punc .. "]+", start)
	if punc then
		return handle_spacing_punc(punc, mora)
	end
	local ch = usub(text, start, start)
	if ch == "<" then
		-- Prefer a complete <...> span; otherwise take "<" together with any following punctuation.
		local markup = umatch(text, "^<.->", start) or umatch(text, "^[<" .. spacing_punc .. "]+", start)
		return handle_spacing_punc(markup, mora)
	end
	-- Close the current mora if it ended in spacing/punctuation, or if it already
	-- has kana and this character is not one that attaches to a previous kana.
	local starts_new_mora = mora.sp or (mora.kana and umatch(ch, "[^" .. submoraic .. "]"))
	if starts_new_mora then
		insert(morae, concat(mora))
		mora = {}
	end
	mora.kana = true
	insert(mora, ch)
	return ch, mora
end
-- Splits `text` into an array of morae.
-- Small vowel kana (and any combining dakuten/handakuten) are grouped with any prior word characters, which should be kana.
-- Non-word characters (spaces, punctuation etc.) are accounted for, and grouped with surrounding morae wherever possible.
function export.moraify(text)
	local morae, pos, total, current = {}, 1, ulen(text)
	while pos <= total do
		local consumed
		consumed, current = iterate_mora(text, pos, morae, current)
		-- Advance by however many characters were consumed in this step.
		pos = pos + ulen(consumed)
	end
	-- Flush the mora that was still being built when the text ended.
	if current then
		insert(morae, concat(current))
	end
	return morae
end
-- Strips <...> spans first, then any remaining "<", spacing and punctuation.
-- Returns whatever ugsub returns for the second substitution.
local function remove_formatting(text)
	local without_tags = text:gsub("<.->", "")
	return ugsub(without_tags, "[<" .. spacing_punc .. "]+", "")
end
-- Counts the number of morae in `text`.
-- Morae that are empty once formatting has been removed are not counted.
function export.count_morae(text)
	local count = 0
	for _, mora in ipairs(export.moraify(text)) do
		if #remove_formatting(mora) ~= 0 then
			count = count + 1
		end
	end
	return count
end
-- Replaces the long vowel mark in mora `i` with the vowel kana implied by the previous mora.
-- i: index into `text`; text: the mora array, modified in place.
-- Does nothing when mora `i` has no long vowel mark, when there is no previous
-- mora, or when the previous mora holds nothing but formatting.
local function do_long_vowel(i, text)
	if not text[i]:find("ー", 1, true) then
		return
	end
	local prev = text[i - 1]
	if not prev then
		return
	end
	-- Reduce the previous mora to its final character, ignoring formatting and combining voice marks.
	prev = ugsub(remove_formatting(prev), "[" .. r_kana_combining_characters .. "]+", "")
		:match("[^\128-\191][\128-\191]*$")
	-- The previous mora may consist solely of spacing/punctuation/markup, which leaves
	-- nothing to match; without this guard the lookup below would be called with nil.
	if not prev then
		return
	end
	for vowel, kana in pairs(r_vowels) do
		-- Plain-text search: `prev` is a literal character, not a pattern.
		if kana:find(prev, 1, true) then
			-- Pick the hiragana or katakana replacement to match the script of the previous character.
			local v = (umatch(prev, "[" .. r_hiragana .. "]") and long_vowels_hira or long_vowels_kata)[vowel]
			text[i] = text[i]:gsub("ー", v, 1)
		end
	end
end
-- Resolves kana iteration marks (ゝ / ヽ); export.normalize_kana calls this once per mora, from the last mora to the first.
-- i: index of the mora being visited.
-- n: value carried over from the previous call, i.e. how many consecutive iteration-mark morae directly follow mora i.
-- text: the mora array, modified in place.
-- Returns n + 1 when mora i itself contains an iteration mark; otherwise returns nothing (the caller turns that into 0).
local function do_iteration_mark(i, n, text)
local mora = text[i]
if mora:find("ゝ") or mora:find("ヽ") then
return n + 1
elseif n == 0 then
-- Not a mark and no marks pending after it: nothing to do.
return
end
-- Count backwards once for each iteration mark, but stop early if we find something which can't be iterated, as that marks the start of the set to be repeated.
local anchor = i
for j = 0, n - 1 do
local prev = text[anchor - j]
if not prev then
-- Ran past the start of the array: only j morae are available to repeat.
n = j
break
end
prev = remove_formatting(prev)
if prev:find("ゝ") or prev:find("ヽ") or umatch(prev, "[%s%p]") then
n = j
break
end
end
if n == 0 then
return
end
-- i now points at the first mora of the run that will be repeated.
i = i - n + 1
-- Replace iteration marks ahead with the relevant character.
for j = i, i + n - 1 do
-- Source mora without formatting, and without a combining voice mark (bytes \227\130\153 / \227\130\154) directly after its first character.
mora = remove_formatting(text[j]):gsub("^(.[\128-\191]*)\227\130[\153\154]", "%1")
text[j + n] = ugsub(text[j + n], "([ゝヽ])([゙゚]?)", function(mark, voicing)
-- Re-apply whatever voice mark followed the iteration mark to the first character of the source mora.
local repl = mora:gsub("^.[\128-\191]*", "%0" .. voicing)
-- ゝ yields hiragana, ヽ yields katakana.
return mark == "ゝ" and export.kata_to_hira(repl) or export.hira_to_kata(repl)
end)
end
return
end
-- Normalizes long vowels, iteration marks and non-combining voice marks to the standard equivalents.
-- Note: output text is normalized to NFD.
function export.normalize_kana(text)
	-- Swap non-combining voice marks via the voice_marks table, then split into morae.
	local decomposed = toNFD(text):gsub("[\227\239][\130\190][\155\156\158\159]", voice_marks)
	local morae = export.moraify(decomposed)
	local total = #morae
	-- Iteration marks are resolved back to front, carrying the count of pending marks.
	local pending = 0
	for i = total, 1, -1 do
		pending = do_iteration_mark(i, pending, morae) or 0
	end
	-- Long vowel marks are resolved front to back.
	for i = 1, total do
		do_long_vowel(i, morae)
	end
	-- Normalize again to be safe.
	local joined = concat(morae)
	return toNFD(joined)
end
end
-- Returns the "stem" of a verb or -i adjective, that is the term minus the final character.
-- f: a string, or a frame-like table whose args[1] holds the string
-- (the same calling convention as the other entry points in this module).
function export.definal(f)
	local text = type(f) == "table" and f.args[1] or f
	return usub(text, 1, -2)
end
-- Removes the ruby markup characters "^", "-", ".", " " and "%" from `text`.
-- Returns the cleaned string only (no replacement count).
function export.remove_ruby_markup(text)
	local cleaned = text:gsub("[%^%-%. %%]", "")
	return cleaned
end
-- Does the work of [[Template:ja-kanji]], [[Template:ryu-kanji]] etc.
-- Should probably be folded into [[Module:Jpan-headword]].
-- frame: the invoking frame; frame.args[1] is the language code, and the parent
-- frame's arguments (grade, rs, shin, kyu, head) come from the template call.
-- Returns the headword wikitext followed by formatted categories, or nothing
-- when the current namespace is not "".
-- Raises an error when the language code is unknown or the page name contains no kanji.
function export.kanji(frame)
	pagename = pagename or load_data("Module:headword/data").pagename
	-- only do this if this entry is a kanji page and not some user's page
	if namespace ~= "" then
		return
	end
	local params = {
		grade = {}, -- To be removed.
		rs = {},
		shin = {},
		kyu = {},
		head = {},
	}
	local lang_code = frame.args[1]
	local lang = get_by_code(lang_code)
	if not lang then
		error("The language code \"" .. tostring(lang_code) .. "\" is not valid.")
	end
	local lang_name = lang:getCanonicalName()
	local args = require("Module:parameters").process(frame:getParent().args, params, nil, "ja", "kanji")
	local sortkey = args.rs or require("Module:Hani-sortkey").makeSortKey(pagename) or pagename -- radical sort
	local shin = args.shin
	local kyu = args.kyu
	local wikitext, categories = {}, {}
	-- display the kanji itself at the top at 275% size
	insert(wikitext, "<div><span lang=\"" .. lang_code .. "\" class=\"Jpan\" style=\"font-size:275%; line-height:1;\">" .. (args.head or pagename) .. "</span></div>")
	-- display information for the grade
	-- determine grade
	local grade, in_parenthesis = export.kanji_grade(pagename), {}
	-- export.kanji_grade returns false when the page name contains no kanji; the grade
	-- lookups and the numeric comparison below cannot work with that, so fail clearly.
	if not grade then
		error("The page \"" .. pagename .. "\" does not contain any kanji, so no kanji grade could be determined.")
	end
	insert(in_parenthesis, data.grade_links[grade])
	if args.grade then
		require("Module:debug/track")("ja/redundant grade parameter")
	end
	-- link to shinjitai if shinjitai was specified, and link to kyujitai if kyujitai was specified
	if kyu then
		insert(in_parenthesis, "[[shinjitai]] kanji, [[kyūjitai]] form <span lang=\"" .. lang_code .. "\" class=\"Jpan\">[[" .. kyu .. "#" .. lang_name .. "|" .. kyu .. "]]</span>")
	elseif shin then
		insert(in_parenthesis, "[[kyūjitai]] kanji, [[shinjitai]] form <span lang=\"" .. lang_code .. "\" class=\"Jpan\">[[" .. shin .. "#" .. lang_name .. "|" .. shin .. "]]</span>")
	end
	insert(wikitext, "''(" .. concat(in_parenthesis, ", ") .. "'')")
	-- add categories
	insert(categories, lang_name .. " kanji")
	insert(categories, lang_name .. " " .. data.grades[grade])
	if grade <= 6 then
		insert(categories, lang_name .. " kyōiku kanji")
		insert(categories, lang_name .. " jōyō kanji") -- Grade 7 get this from the data.
	end
	-- mw.title.new returns nil for an invalid title, so check before reading .exists.
	local spelled_with = mw.title.new(lang_name .. " terms spelled with " .. pagename, 14)
	if spelled_with and spelled_with.exists then
		insert(wikitext, 1, "<div class=\"noprint floatright catlinks\" style=\"font-size: 90%; width: 270px\"><div style=\"padding:0 5px\"><i>See also:</i><div style=\"margin-left: 10px;\">[[:Category:" .. lang_name .. " terms spelled with " .. pagename .. "]]</div></div></div>")
	end
	return concat(wikitext) .. require("Module:utilities").format_categories(categories, lang, sortkey)
end
-- Looks up the grade of `kanji`.
-- Returns the index of the first entry of data.grade_kanji that contains `kanji`
-- as a literal substring; otherwise 9 when `kanji` contains a character from
-- range.kanji, and false when it does not.
function export.kanji_grade(kanji)
	for grade, members in ipairs(data.grade_kanji) do
		local hit = find(members, kanji, 1, true)
		if hit then
			return grade
		end
	end
	if umatch(kanji, "[" .. range.kanji .. "]") then
		return 9
	end
	return false
end
return export