# Score stdin text by word rarity: MeCab prints each token's lemma (%f[6]) and
# character type (%t); awk first indexes the downloaded frequency list
# (word -> rank), then for each input line sums the ranks of tokens whose
# character type is 2 (contains kanji) or 6 (hiragana) and prints the sum.
mecab -F '%f[6]\t%t\t' -E '\n'|awk -F\\t 'NR==FNR{a[$0]=NR;next}{sum=0;for(i=1;i<NF-2;i+=2){if($(i+1)~/2|6/)sum+=a[$i]};print sum}' <(curl -s jptxt.net/word-frequency.txt|grep -v '^#'|cut -d\; -f1) -
The script uses MeCab to find lemmas of words, selects words of type 2 (at least one kanji) or 6 (hiragana), and sums the positions of the words on a word frequency list.
# Replace the first 2120 RTK keywords found in the input text with their
# kanji. Keywords are matched case-insensitively at word boundaries (or
# next to typographic quotes/dashes).
text = $<.read
keyword_lines = IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/)[0..2119]
keyword_lines.each do |entry|
  kanji, keyword = entry.split(";").values_at(0, 2)
  text.gsub!(/(?<=\b|[“”‘’…—])#{Regexp.escape(keyword)}(?=\b|[“”‘’…—])/i, kanji)
end
puts text
The output looks like this:
"An 醜 bit 之 古 metal," says the 聖 男 to the shopkeeper; "but it
will 為 井 enough to 煮 my humble drop 之 水 之 an 夕. 吾'll
呉 you 三 厘 for it." This 彼 did and took the kettle 宅,
rejoicing; for it was 之 bronze, fine 働, the very 物 for the
Cha-no-yu.
# Convert katakana (ァ U+30A1 .. ヺ U+30FA) to the corresponding hiragana
# by shifting the codepoint down by 0x60; every other character (including
# the long-vowel mark ー) passes through unchanged.
def kata2hira(x)
  x.codepoints.map { |cp|
    (0x30a1..0x30fa).cover?(cp) ? cp - 0x60 : cp
  }.pack("U*")
end
# Escape the characters that are special in HTML so words and readings can
# be embedded safely in <ruby> markup. The original body had been mangled
# by entity-unescaping into no-op substitutions like gsub("&", "&");
# restore the intended entity escapes.
def amp(x)
  x.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
end
# Render `word` with its `reading` as HTML <ruby> furigana markup.
# The reading is normalized to hiragana; kana runs already present in the
# word are matched against the reading so that only the non-kana parts
# receive annotations. Returns the word unchanged when no annotation is
# needed.
def furigana(word, reading)
hira = kata2hira(reading)
# Pure kana / fullwidth words, empty readings, or word == reading need no ruby.
if word == reading or reading == "" or word =~ /^[\u{3040}-\u{30ff}\u{ff00}-\u{ffef}]+$/
word
elsif word == hira
word
elsif word =~ /^[\u{4e00}-\u{9fff}]+$/
# All-kanji word: one annotation spanning the whole word.
"<ruby><rb>#{amp(word)}</rb><rt>#{amp(hira)}</rt></ruby>"
else
# Mixed word: split into alternating non-kana / kana runs.
groups = word.scan(/(?:[^\u{3040}-\u{30ff}]+|[\u{3040}-\u{30ff}]+)/)
# Build a regex over the reading: kana runs must match themselves (in
# either script), non-kana runs match lazily.
regex = "^" + groups.map { |g|
if g =~ /^[\u{3040}-\u{30ff}]+$/
"(#{Regexp.escape(kata2hira(g))}|#{Regexp.escape(g)})"
else
"(.+?)"
end
}.join + "$"
kanagroups = hira.scan(Regexp.new(regex))[0]
# Fall back to a single whole-word annotation if the reading can't be
# aligned with the groups.
return "<ruby><rb>#{amp(word)}</rb><rt>#{amp(hira)}</rt></ruby>" unless kanagroups
# NOTE(review): the lazy (.+?) groups can mis-align the reading when a
# kana run (e.g. の) also occurs inside a kanji reading — known to
# misrender words like 物の怪 (もののけ) and longer sentences.
0.upto(groups.length - 1) { |i|
unless groups[i] =~ /[\u{3040}-\u{30ff}]/
groups[i] = "<ruby><rb>#{amp(groups[i])}</rb><rt>#{amp(kanagroups[i])}</rt></ruby>"
end
}
groups.join
end
end
if __FILE__ == $0
  # Smoke test when run directly: each line is "word reading"; print the
  # ruby-annotated form of every sample.
  samples = "次々 つぎつぎ
ユニークな ユニークな
痛い いたい
困難な こんなんな
言い訳 いいわけ
ごろごろ
カット かっと
くっ付ける くっつける
ジェット機 じぇっとき
湿っぽい しめっぽい
東京ドーム とうきょうドーム
3月 さんげつ
一ヶ月 いっかげつ
X線 エックスせん
八ッ橋 やつはし
4ヵ年 よんかねん
ィ形容詞 イけいようし
黄色い きいろい
物の怪 もののけ
鬼に金棒 おににかなぼう
千円貸してください せんえんかしてください"
  samples.split("\n").each do |sample|
    puts furigana(*sample.split(" ", 2))
  end
end
This script doesn't work with some words like 物の怪 (もののけ) or 鬼に金棒 (おににかなぼう). It can also be used with sentences, but for example the last test case is converted to <ruby><rb>千円貸</rb><rt>せんえんか</rt></ruby>してください.
require "./furigana"
# Annotate a line of Japanese text with furigana: MeCab splits it into
# morphemes, printing "surface\treading" per morpheme (-F) and the bare
# surface for unknown morphemes (-U); each pair is passed to furigana().
def mecab_furigana(line)
  raw = IO.popen(["mecab", "-F%m\\t%f[7]\\n", "-U%m\\t\\n", "-E", ""], "r+") do |mecab|
    mecab.puts line
    mecab.close_write
    mecab.read
  end
  raw.lines.map { |pair| furigana(*pair.chomp.split("\t", 2)) }.join
end
# Demo: print furigana-annotated versions of assorted test sentences
# (numbers, counters, katakana, dates, and other tricky cases).
sentences = "「IT」は何の略か知っていますか。
この綱は直径20cmあるそうです。
妹は来年、二十歳になります。
今日の新聞、どこに置いた?
3月は仕事が忙しい。
彼は数学の博士だそうです。
彼女はOLです。
工事は3月まで続きます。
定価から2000円割り引きますよ。
私の国について少しお話しましょう。
東京ドーム
10ヶ国
12ヶ月
どうしよ~。
X線
No.2
命の親
〆切"
sentences.split.each do |sentence|
  puts mecab_furigana(sentence)
end
Anki's Japanese support plugin also uses MeCab to generate furigana. See https://github.com/dae/ankiplugins/blob/master/japanese/reading.py.
# Build an HTML review page and one m4a of 150 random sentences taken from
# the last 4000 lines of core-6000.txt (rows whose field 12 exceeds 5000),
# with 1.5 s of silence after each sentence.
# 1.5 seconds of silence, reused as the gap between sentences.
ffmpeg -v quiet -t 1.5 -f s16le -i /dev/zero /tmp/0.wav
i=10000
while read index sentence translation; do
# Strip <b>...</b> markup from the sentence text.
sentence=${sentence//<b>}
output+="<div><div>${sentence//<\/b>}</div><div>$translation</div></div>"$'\n'
# Sentence audio (numbered so sox concatenates in order), then the gap.
ffmpeg -v quiet -i ~/japanese/dir/iknow/sentences/$(printf %04d $index).mp3 /tmp/$((i++)).wav < /dev/null
cp /tmp/0.wav /tmp/$((i++)).wav
done < <(tail -n4000 ~/Sites/jp/core-6000.txt | awk -F\; '$12>5000{print $1,$5,$7}' | gshuf -n150)
html=/tmp/core-sentences.html
# Replace the previous card block in the HTML file with the new output.
echo "$output" | ruby -i -e 'print gets(nil).sub /<div.*div>\n/m,STDIN.read' "$html"
# Concatenate all generated clips (names start with 1) and encode.
sox /tmp/1*.wav /tmp/0.wav
ffmpeg -v quiet -i /tmp/0.wav -aq 150 -y /tmp/core-sentences.m4a
rm /tmp/*.wav
open "$html"
# Print two-kanji EDICT compounds whose gloss is exactly the concatenation
# of the two kanji's RTK keywords (e.g. "special skill" for 特技), as
# "reading word gloss" with the reading right-padded to 5 fullwidth spaces.
rtk = Hash[IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(2200).map { |l|
  l.split(";").values_at(0, 2)
}]
kanji = rtk.keys.join
IO.foreach("#{Dir.home}/japanese/data/edict.txt") { |l|
  # scan returns one array of captures per match; take the first match so
  # c is the flat [word, reading, gloss]. (The original indexed the nested
  # scan result as if it were flat, so c[2].sub raised on every match.)
  c = l.scan(/^([#{kanji}]{2}) \[(.*?)\] \/(.*?)\//)[0]
  next if c.nil?
  # Strip part-of-speech tags, a trailing parenthetical, and leading "to ".
  c[2] = c[2].sub(/^(\([^)]*\) )*/, "").sub(/ \([^)]*\)$/, "").sub(/^to /, "")
  # Keep only words whose gloss equals "keyword1 keyword2".
  next unless rtk[c[0][0]] + " " + rtk[c[0][1]] == c[2]
  puts c[1].rjust(5, "\u{3000}") + " " + c[0] + " " + c[2]
}
The output looks like this:
とくぎ 特技 special skill
のうは 脳波 brain waves
さんそう 山荘 mountain villa
# Build hiragana-audio.m4a: for each (shuffled) vocabulary line, append
# the pod recording followed by a synthesized reading of field 2.
cd "${0%/*}"
i=1000
while read l; do
# Fields are ';'-separated: c[1] = pod file name, c[2] = text for say.
IFS=\; c=($l)
f=~/japanese/files/pod/${c[1]}.mp3
# Skip entries with no recording.
[[ -e $f ]] || continue
# Numbered output files keep sox concatenation in generation order.
ffmpeg -v 0 -i $f -ar 22050 /tmp/$((i++)).aif < /dev/null
# Speak the text at reduced volume with an 800 ms pause after it.
say "[[slnc 0]][[volm 0.6]]${c[2]}[[slnc 800]]" -v alex -o /tmp/$((i++)).aif
done < <(gshuf ~/japanese/review/hiragana-vocabulary.txt)
sox /tmp/*.aif /tmp/0.aif
ffmpeg -i /tmp/0.aif -c:a libfaac -q 150 -y ~/japanese/review/hiragana-audio.m4a
rm /tmp/*.aif
# Build ../review/episodes.html: flashcards for uncommon words extracted
# (via MeCab) from .srt subtitle files on the Desktop, restricted to words
# written with already-learned RTK kanji.
Dir.chdir(__dir__)
require "./furigana"
edict = {}
# Iterate in reverse so that the FIRST JMdict entry for a headword ends up
# in the hash (later assignments overwrite earlier ones).
IO.read("../data/JMdict_e").scan(/<entry>.*?<\/entry>/m).reverse.each { |entry|
keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
reb = entry[/(?<=<reb>).*(?=<\/reb>)/]
gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
# Strip leading tags, a trailing parenthetical, and leading "to ".
gloss = gloss.sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
edict[keb] = [reb, gloss]
}
# Membership set of "uncommon" words (frequency ranks 20000-200000).
# NOTE(review): this splits the file on whitespace; other scripts here
# split lines on ';' — confirm which format word-frequency.txt uses.
freq = Hash[IO.read("#{Dir.home}/Sites/jp/word-frequency.txt").split[20000..200000].map { |w| [w, nil] }]
# First character of each keyword line is the kanji itself.
rtk = IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(2200).map { |l| l[0] }.join
learned = Dir["../data/words[012].txt"].map { |f| IO.read(f).split }.flatten
# MeCab prints "chartype lemma" per token; awk keeps lemmas of type 2
# (contains kanji).
words = `for f in ~/desktop/*.srt; do mecab -F '%t %f[6]\\n' "$f"; done | awk '$1=="2"{print $2}'`
output = ""
(words.split - learned).uniq.shuffle.each { |word|
next unless freq.key?(word)
# Only words of hiragana and/or learned RTK kanji, length >= 2.
next unless word =~ /^[\u{3040}-\u{309f}#{rtk}]{2,}$/
reb, gloss = edict[word] || next
next unless gloss =~ /^[a-z -]{1,18}$/
output << "<div onclick=\"highlight(this)\"><div>#{furigana(word, reb)}</div><div>#{gloss}</div></div>\n"
}
exit if output == ""
f = "../review/episodes.html"
# Replace the previous card block in the HTML file in place.
IO.write(f, IO.read(f).sub(/<div.*<\/div>/m, output))
system("open", f)
This script generates an HTML file for reviewing uncommon words in subtitle files or other Japanese text.
# Membership set of "uncommon" words: everything past the first 10000
# entries of the frequency list (values are unused; only key? matters).
freq = IO.read("#{Dir.home}/Sites/jp/word-frequency.txt").split[10000..-1]
         .each_with_object({}) { |token, table| table[token] = nil }
# Map each sufficiently-uncommon JMdict headword to a short English gloss
# (first entry per headword wins; glosses longer than 20 chars dropped).
edict = {}
IO.read("../data/JMdict_e.xml").scan(/<entry>.*?<\/entry>/m).each do |entry|
  keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
  next if edict[keb] || !freq.key?(keb)
  gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
          .sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
  edict[keb] = gloss if gloss.length <= 20
end
# Rewrite each .srt file on the Desktop in place, inserting an English
# gloss after every word found in `edict` (uncommon words with short
# glosses). Subtitle structure (index, timestamp, text) is preserved.
Dir["#{Dir.home}/Desktop/*.srt"].each { |f|
  out = ""
  IO.read(f).gsub("\r", "").split("\n\n").each { |s|
    id, time, subs = s.split("\n", 3)
    out << id + "\n" + time + "\n"
    # MeCab emits "surface\tlemma" per morpheme and EOS per sentence end.
    IO.popen(["mecab", "-F%M\t%f[6]\n", "-U%M\n", "-E", "EOS\n"], "r+") { |io|
      io.puts subs
      io.close_write
      io.read
    }.split("\n").each { |morpheme|
      if morpheme == "EOS"
        out << "\n"
      elsif morpheme =~ /(.+)\t(.+)/
        if english = edict[$2]
          out << " " + $1 + " " + english + " "
        else
          out << $1
        end
      else
        out << morpheme
      end
    }
    out << "\n"
  }
  # Trim the spaces added around glosses at line edges. Must be gsub, not
  # gsub!: gsub! returns nil when nothing matched, which would overwrite
  # the subtitle file with an empty string.
  IO.write(f, out.gsub(/^ | $/, ""))
}
This script modifies Japanese srt subtitles to add translations after uncommon words:
778
01:03:09,196 --> 01:03:13,200
上巻 first volume 下巻 last volume じゃなくて
上中下だって。
#!/usr/bin/env ruby
# Convert RTK keywords (comma- or newline-separated, from argv or stdin)
# to their kanji, e.g. `rtk Sino-,character` prints 漢字.
keyword_to_kanji = Hash[IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(3000).map { |line|
  line.split(";").values_at(2, 0)
}]
puts (ARGV[0] || STDIN.read).split(/[,\n]/m).map { |keyword| keyword_to_kanji[keyword] }.join
This script converts RTK keywords to kanji:
$ rtk Sino-,character
漢字
I used this script to generate the first eight columns in kanji.txt.
# Dump kanjidic2.xml as tab-separated fields, one line per kanji:
# literal, on readings, kun readings, nanori, meanings, grade, strokes,
# classical radical.
require "nokogiri"
xml = IO.read("#{Dir.home}/japanese/data/kanjidic2.xml")
Nokogiri.XML(xml).css("character").each do |char|
  fields = []
  fields << char.css("literal").text
  fields << char.css("reading[r_type='ja_on']").map(&:text).join(" ")
  fields << char.css("reading[r_type='ja_kun']").map(&:text).join(" ")
  fields << char.css("nanori").map(&:text).join(" ")
  fields << char.css("meaning:not([m_lang])").map(&:text).join(", ")
  fields << char.css("grade").text
  fields << char.css("stroke_count").text
  fields << char.css("rad_value[rad_type='classical']").text
  puts fields.join("\t")
end
#!/usr/bin/env bash
# Terminal flashcard quiz: show a kanji, type its RTK keyword. Every
# answer is appended to repetitions.txt; the kanji missed this session
# are printed when the script exits.
cd "${0%/*}"
LC_ALL=en_US.UTF-8
trap onexit EXIT
clear
# Print every kanji answered incorrectly (status x) this session.
onexit() {
echo
clear
printf %s "$out" | while read l; do
# Log line layout "epoch status kanji keyword": the 10-digit epoch puts
# the status at offset 11, the kanji at 13, the keyword at 15+.
[[ ${l:11:1} = x ]] && printf %s "${l:13:1} ${l:15} "
done
printf '\b\n'
}
# keywords[] alternates kanji, keyword (fields 1 and 3 of each line).
IFS=$';\n' read -d '' -a keywords < <(grep -v '^#' ~/Sites/jp/rtk-keywords.txt | head -n2200 | cut -d\; -f1,3)
while :; do
let i++
# $RANDOM alone is 15-bit; concatenating two widens the range before
# taking it modulo the number of cards.
framenumber=$(($RANDOM$RANDOM%(${#keywords[@]}/2)))
kanji=${keywords[$framenumber*2]}
keyword=${keywords[$framenumber*2+1]}
# Pad so the prompt appears near the middle of the terminal.
pad=$(printf %$(($(tput cols)/2-7))s)
read -ep "$pad$kanji " -n ${#keyword} answer
if [[ $answer = $keyword ]]; then
status=o
else
# Wrong: show the correct keyword briefly, then drain stray keypresses.
clear
echo "$pad$kanji $keyword"
sleep 2
clear
status=x
read -d '' -t0.001 -n99999
printf '\e[2K\r'
fi
repetition="$(date +%s) $status $kanji $keyword"
echo "$repetition" >> repetitions.txt
out+="$repetition"$'\n'
# Stop after 50 cards; the EXIT trap prints the misses.
[[ $i = 50 ]] && exit
done
# Build freqtwoaudio.aif: for each unlearned two-kanji word (frequency
# ranks 30000-100000) that has a pod recording, append the word's audio,
# a spoken hint (the two kanji's RTK keywords plus the gloss), then the
# word's audio again.
edict = {}
IO.read("#{Dir.home}/japanese/data/JMdict_e").scan(/<entry>.*?<\/entry>/m).each { |entry|
  keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
  # First entry per headword wins.
  next if edict[keb]
  reb = entry[/(?<=<reb>).*(?=<\/reb>)/]
  gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
  gloss = gloss.sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
  edict[keb] = [reb, gloss]
}
rtk = {}
IO.read("#{Dir.home}/Sites/jp/rtk-keywords.txt").split("\n").grep(/^[^#]/).take(2200).each { |l|
  c = l.split(";")
  rtk[c[0]] = c[2]
}
kanji = rtk.keys.join
learned = {}
Dir["#{Dir.home}/japanese/data/words[012].txt"].each { |f|
  IO.foreach(f) { |w| learned[w.chomp] = nil }
}
# Numeric filename prefix keeps the clips in generation order for sox.
i = 10000
IO.readlines("#{Dir.home}/Sites/jp/word-frequency.txt")[30000..100000].map { |l|
  l.split(";")[0]
}.each { |word|
  next unless word =~ /^[#{kanji}]{2}$/
  next if learned.key?(word)
  next unless edictword = edict[word]
  # NOTE(review): unanchored at the end, unlike the sibling scripts'
  # /^[a-z -]{1,20}$/ — confirm whether long glosses should slip through.
  next unless edictword[1] =~ /^[a-z '-]{1,20}/
  pod = "#{Dir.home}/japanese/files/pod/#{word} #{edictword[0]}.mp3"
  next unless File.exist?(pod)
  # Clips are written to /tmp/two (assumed to exist — TODO confirm).
  `ffmpeg -v quiet -i '#{pod}' -ar 22050 -y /tmp/two/#{i += 1}\\ #{word}\\ #{edictword[0]}.aif`
  system("say", "[[volm 0.45]]#{rtk[word[0]]} #{rtk[word[1]]}[[slnc 200]]#{edictword[1]}", "-o", "/tmp/two/#{i += 1}.aif")
  `ffmpeg -v quiet -i '#{pod}' -ar 22050 -y /tmp/two/#{i += 1}.aif`
}
# Concatenate and clean up in /tmp/two — the clips were written there,
# so the original /tmp/*.aif globs matched none of them.
`sox /tmp/two/*.aif ~/Japanese/review/freqtwoaudio.aif
rm /tmp/two/*.aif`
# Build printable-homophones.html: for each reading shared by several
# two-kanji words (uncommon, learned-kanji-only, distinct short glosses),
# emit a group of 2-4 words keyed by the reading.
edict = {}
IO.read("#{Dir.home}/japanese/data/JMdict_e").scan(/<entry>.*?<\/entry>/m).each { |entry|
keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
# First entry per headword wins.
next if edict[keb]
reb = entry[/(?<=<reb>).*(?=<\/reb>)/]
gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
# Strip leading tags, a trailing parenthetical, and leading "to ".
gloss = gloss.sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
edict[keb] = [reb, gloss]
}
freq = {}
# Membership set: frequency ranks 10000-50000.
IO.foreach("#{Dir.home}/sites/jp/word-frequency.txt").grep(/^[^#]/)[10000..50000].each { |l|
freq[l.split(";")[0]] = nil
}
rtk = Hash[IO.foreach("#{Dir.home}/sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(2200).map { |l|
l.split(";").values_at(0, 2)
}]
words = []
# Each w is [word, reading, gloss] for a two-RTK-kanji edict entry.
IO.read("#{Dir.home}/japanese/data/edict.txt").scan(/^([#{rtk.keys.join}]{2}) \[(.*?)\] \/(.*?)\//).each { |w|
next unless freq.key?(w[0])
w[2] = w[2].sub(/^(\(.*?\) )*/, "").sub(/ \([^)]*?\)$/, "").sub(/^to /, "")
next unless w[2] =~ /^[a-z -]{1,20}$/
words << w
}
# All distinct readings among the selected words.
kana = words.transpose[1].uniq
output = ""
# NOTE(review): `audio` is never used below — confirm it can be removed.
audio = []
kana.shuffle.each { |k|
found = words.select { |w| w[1] == k }
# Keep only words whose gloss is unique within the group; take up to 4.
found = found.select { |w| found.transpose[2].count(w[2]) == 1 }.sample(4)
next unless found.size >= 2
output << "<div><span>#{k}</span>\n"
found.each { |f|
output << "<div><div>#{f[0]}</div><div>#{f[2]}</div></div>\n"
}
output << "</div>\n"
}
exit if output == ""
f = "#{Dir.home}/Sites/jp/printable-homophones.html"
# Replace the previous card block in the HTML file in place.
IO.write(f, IO.read(f).sub(/<div.*\/div>\n/m, output))
# Build printable-two-kanji.html: a shuffled list of two-kanji words
# (frequency ranks 15000-40000, not in Core 6000, learned RTK kanji only,
# short lowercase gloss) rendered as reading / word / gloss cards.
edict = {}
IO.read("#{Dir.home}/japanese/data/JMdict_e").scan(/<entry>.*?<\/entry>/m).each do |entry|
  keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
  # First entry per headword wins.
  next if edict[keb]
  reb = entry[/(?<=<reb>).*(?=<\/reb>)/]
  gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
          .sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
  edict[keb] = [reb, gloss]
end
# First character of each keyword line is the kanji itself.
rtk = IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(2200).map { |l| l[0] }.join
freq = IO.read("#{Dir.home}/Sites/jp/word-frequency.txt").scan(/^[^#;]+/)[15000..40000]
core = IO.read("#{Dir.home}/Sites/jp/core-6000.txt").scan(/^[^#;]+/)
output = ""
(freq - core).shuffle.each do |word|
  next unless word =~ /^[#{rtk}]{2}$/
  entry = edict[word] || next
  next unless entry[1] =~ /^[a-z -]{1,20}$/
  output << "<div><div>#{entry[0]}</div><div>#{word}</div><div>#{entry[1]}</div></div>\n"
end
f = "#{Dir.home}/Sites/jp/printable-two-kanji.html"
# Replace the previous card block in the HTML file in place.
IO.write(f, IO.read(f).sub(/<div.*<\/div>\n/m, output))