sentence-difficulty.sh

# Score sentence difficulty.  MeCab emits "lemma\tchartype\t" per morpheme
# (-F) with one sentence per output line (-E '\n').  The awk program's first
# pass (NR==FNR) loads the frequency list, mapping each word to its rank;
# the second pass walks each sentence's lemma/chartype pairs and sums the
# ranks of morphemes whose character type is 2 (contains kanji) or
# 6 (hiragana), printing one score per sentence.
mecab -F '%f[6]\t%t\t' -E '\n'|awk -F\\t 'NR==FNR{a[$0]=NR;next}{sum=0;for(i=1;i<NF-2;i+=2){if($(i+1)~/2|6/)sum+=a[$i]};print sum}' <(curl -s jptxt.net/word-frequency.txt|grep -v '^#'|cut -d\; -f1) -

The script uses MeCab to find lemmas of words, selects words of type 2 (at least one kanji) or 6 (hiragana), and sums the positions of the words on a word frequency list.

rtk-english.rb

# Replace the first 2120 RTK keywords in the input text with their kanji.
# Keywords match case-insensitively and only at word boundaries (or next
# to typographic punctuation), so substrings of longer words are skipped.
text = $<.read
entries = IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt")
entries.grep(/^[^#]/)[0..2119].each do |entry|
  kanji, keyword = entry.split(";").values_at(0, 2)
  pattern = /(?<=\b|[“”‘’…—])#{Regexp.escape(keyword)}(?=\b|[“”‘’…—])/i
  text.gsub!(pattern, kanji)
end
puts text

The output looks like this:

"An 醜 bit 之 古 metal," says the 聖 男 to the shopkeeper; "but it
will 為 井 enough to 煮 my humble drop 之 水 之 an 夕. 吾'll
呉 you 三 厘 for it." This 彼 did and took the kettle 宅,
rejoicing; for it was 之 bronze, fine 働, the very 物 for the
Cha-no-yu.

furigana.rb

# Convert katakana to hiragana by shifting each code point down 0x60
# (e.g. カ U+30AB -> か U+304B).  The range ends at ヶ U+30F6: the
# katakana ヷ-ヺ (U+30F7..U+30FA) have no hiragana counterparts, and the
# previous upper bound of U+30FA shifted them onto unassigned code points
# and combining sound marks; they now pass through unchanged.
def kata2hira(x)
  x.gsub(/[\u{30a1}-\u{30f6}]/) { [$&.ord - 96].pack("U") }
end

# HTML-escape the characters &, < and > in a single pass.  Equivalent to
# escaping & first and then the angle brackets.
def amp(x)
  x.gsub(/[&<>]/, "&" => "&amp;", "<" => "&lt;", ">" => "&gt;")
end

# Return HTML <ruby> markup annotating `word` with its kana `reading`.
# - Returns `word` unchanged when no annotation is needed (word is pure
#   kana/fullwidth, reading empty, or reading identical to the word).
# - Pure-kanji words are annotated as one <ruby> element.
# - Mixed words are split into kana / non-kana runs and the reading is
#   aligned against them; if alignment fails the whole word is annotated
#   at once.  NOTE(review): the lazy (.+?) groups can mis-align readings
#   when a kana run also occurs inside a kanji run's reading
#   (e.g. 物の怪 / もののけ) — known limitation.
# `reading` now defaults to "" so callers may pass only a word: the
# self-test data includes a line (ごろごろ) with no reading, which
# previously raised ArgumentError.
def furigana(word, reading = "")
  hira = kata2hira(reading)
  if word == reading or reading == "" or word =~ /^[\u{3040}-\u{30ff}\u{ff00}-\u{ffef}]+$/
    word
  elsif word == hira
    word
  elsif word =~ /^[\u{4e00}-\u{9fff}]+$/
    "<ruby><rb>#{amp(word)}</rb><rt>#{amp(hira)}</rt></ruby>"
  else
    # Alternating runs of non-kana and kana characters.
    groups = word.scan(/(?:[^\u{3040}-\u{30ff}]+|[\u{3040}-\u{30ff}]+)/)
    # Kana runs must match themselves (hiragana or original katakana form);
    # non-kana runs match lazily.
    regex = "^" + groups.map { |g|
      if g =~ /^[\u{3040}-\u{30ff}]+$/
        "(#{Regexp.escape(kata2hira(g))}|#{Regexp.escape(g)})"
      else
        "(.+?)"
      end
    }.join + "$"
    kanagroups = hira.scan(Regexp.new(regex))[0]
    # Fallback: annotate the whole word when the reading cannot be aligned.
    return "<ruby><rb>#{amp(word)}</rb><rt>#{amp(hira)}</rt></ruby>" unless kanagroups
    0.upto(groups.length - 1) { |i|
      unless groups[i] =~ /[\u{3040}-\u{30ff}]/
        groups[i] = "<ruby><rb>#{amp(groups[i])}</rb><rt>#{amp(kanagroups[i])}</rt></ruby>"
      end
    }
    groups.join
  end
end

# Self-test: each line is "word reading"; print the furigana markup.
# NOTE(review): the ごろごろ line splits into a single field, so furigana
# is called with one argument — this raises ArgumentError unless the
# reading parameter has a default value.
if __FILE__ == $0
  "次々 つぎつぎ
ユニークな ユニークな
痛い いたい
困難な こんなんな
言い訳 いいわけ
ごろごろ
カット かっと
くっ付ける くっつける
ジェット機 じぇっとき
湿っぽい しめっぽい
東京ドーム とうきょうドーム
3月 さんげつ
一ヶ月 いっかげつ
X線 エックスせん
八ッ橋 やつはし
4ヵ年 よんかねん
ィ形容詞 イけいようし
黄色い きいろい
物の怪 もののけ
鬼に金棒 おににかなぼう
千円貸してください せんえんかしてください".split("\n").each { |line|
    puts furigana(*line.split(" ", 2))
  }
end

This script doesn't work with some words like 物の怪 (もののけ) or 鬼に金棒 (おににかなぼう). It can also be used with sentences, but for example the last test case is converted to <ruby><rb>千円貸</rb><rt>せんえんか</rt></ruby>してください.

mecab-furigana.rb

require "./furigana"

# Feed `line` to MeCab, which prints one "surface<TAB>feature-7" row per
# morpheme (feature 7 is the reading in the default dictionary layout —
# verify against the installed dictionary), then convert each row into
# ruby markup via furigana().
def mecab_furigana(line)
  raw = IO.popen(["mecab", "-F%m\\t%f[7]\\n", "-U%m\\t\\n", "-E", ""], "r+") do |mecab|
    mecab.puts line
    mecab.close_write
    mecab.read
  end
  raw.lines.map do |row|
    surface_and_reading = row.chomp.split("\t", 2)
    furigana(*surface_and_reading)
  end.join
end

# Smoke test: print furigana markup for each sample sentence.
# NOTE(review): .split with no argument splits on any whitespace, not just
# newlines — this works only because no sample contains a space.
"「IT」は何の略か知っていますか。
この綱は直径20cmあるそうです。
妹は来年、二十歳になります。
今日の新聞、どこに置いた?
3月は仕事が忙しい。
彼は数学の博士だそうです。
彼女はOLです。
工事は3月まで続きます。
定価から2000円割り引きますよ。
私の国について少しお話しましょう。
東京ドーム
10ヶ国
12ヶ月
どうしよ~。
X線
No.2
命の親
〆切".split.each { |line|
  puts mecab_furigana(line)
}

Anki's Japanese support plugin also uses MeCab to generate furigana. See https://github.com/dae/ankiplugins/blob/master/japanese/reading.py.

core-sentences.sh

# Build an HTML page and an audio track from 150 random Core 6000 sentences.
# /tmp/0.wav is 1.5 s of silence used as a spacer after each sentence.
ffmpeg -v quiet -t 1.5 -f s16le -i /dev/zero /tmp/0.wav
i=10000
# Fields: index, sentence (with <b> markup), translation.
while read index sentence translation; do
  sentence=${sentence//<b>}
  output+="<div><div>${sentence//<\/b>}</div><div>$translation</div></div>"$'\n'
  # </dev/null keeps ffmpeg from consuming the loop's stdin.
  ffmpeg -v quiet -i ~/japanese/dir/iknow/sentences/$(printf %04d $index).mp3 /tmp/$((i++)).wav < /dev/null
  cp /tmp/0.wav /tmp/$((i++)).wav
done < <(tail -n4000 ~/Sites/jp/core-6000.txt | awk -F\; '$12>5000{print $1,$5,$7}' | gshuf -n150)
html=/tmp/core-sentences.html
# Replace the existing sentence divs in the page with the new batch.
echo "$output" | ruby -i -e 'print gets(nil).sub /<div.*div>\n/m,STDIN.read' "$html"
# Concatenate the numbered clips (all start with 1) into one file.
sox /tmp/1*.wav /tmp/0.wav
ffmpeg -v quiet -i /tmp/0.wav -aq 150 -y /tmp/core-sentences.m4a
rm /tmp/*.wav
open "$html"

rtk-compounds.rb

# List two-kanji EDICT compounds whose English gloss is exactly the two
# kanji's RTK keywords joined by a space (e.g. 特技 -> "special skill").
rtk = Hash[IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(2200).map { |l|
  l.split(";").values_at(0, 2)
}]
kanji = rtk.keys.join
IO.foreach("#{Dir.home}/japanese/data/edict.txt") { |l|
  # scan returns an array of group arrays; take the first (only) match so
  # c is the flat [compound, reading, gloss] the indexing below expects.
  # (Indexing the outer scan result directly made c[2] nil and crashed.)
  c = l.scan(/^([#{kanji}]{2}) \[(.*?)\] \/(.*?)\//)[0]
  next unless c
  # Strip leading part-of-speech tags, a trailing parenthetical and "to ".
  c[2] = c[2].sub(/^(\([^)]*\) )*/, "").sub(/ \([^)]*\)$/, "").sub(/^to /, "")
  # Keep only compounds whose gloss is keyword1 + " " + keyword2.
  next unless rtk[c[0][0]] + " " + rtk[c[0][1]] == c[2]
  # Right-align the reading with ideographic spaces for a neat column.
  puts c[1].rjust(5, "\u{3000}") + " " + c[0] + " " + c[2]
}

The output looks like this:

  とくぎ 特技 special skill
  のうは 脳波 brain waves
 さんそう 山荘 mountain villa

hiragana-audio.sh

# Build hiragana-audio.m4a: for each shuffled vocabulary line, append the
# word's podcast clip followed by its third ;-separated field spoken by
# macOS `say`.
cd "${0%/*}"

i=1000
while read l; do
  # Split the ;-delimited line into the array c.
  IFS=\; c=($l)
  # Presumably c[1] is the audio file's base name — verify data format.
  f=~/japanese/files/pod/${c[1]}.mp3
  [[ -e $f ]] || continue
  # </dev/null keeps ffmpeg from consuming the loop's stdin.
  ffmpeg -v 0 -i $f -ar 22050 /tmp/$((i++)).aif < /dev/null
  say "[[slnc 0]][[volm 0.6]]${c[2]}[[slnc 800]]" -v alex -o /tmp/$((i++)).aif
done < <(gshuf ~/japanese/review/hiragana-vocabulary.txt)
# Numbered clips concatenate in order; /tmp/0.aif does not exist yet when
# the glob expands.
sox /tmp/*.aif /tmp/0.aif
ffmpeg -i /tmp/0.aif -c:a libfaac -q 150 -y ~/japanese/review/hiragana-audio.m4a
rm /tmp/*.aif

episodes.rb

Dir.chdir(__dir__)

require "./furigana"

# Build episodes.html: pick unlearned, mid-frequency words found in the
# desktop subtitle files — writable with hiragana plus the first 2200 RTK
# kanji — and insert them (furigana + short gloss) into the review page.

# JMdict headword => [reading, gloss].  Entries are scanned in reverse so
# the earliest entry in the file wins.
edict = {}
IO.read("../data/JMdict_e").scan(/<entry>.*?<\/entry>/m).reverse.each { |entry|
  keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
  reb = entry[/(?<=<reb>).*(?=<\/reb>)/]
  gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
  # Strip a leading tag, a trailing parenthetical and the infinitive "to ".
  gloss = gloss.sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
  edict[keb] = [reb, gloss]
}

# Membership set for frequency ranks 20000..200000 (uncommon words).
# NOTE(review): splits on whitespace — if word-frequency.txt is
# ;-delimited as in the sibling scripts, each key keeps its ;-suffix and
# freq.key?(word) below would never match.  Verify the file format.
freq = Hash[IO.read("#{Dir.home}/Sites/jp/word-frequency.txt").split[20000..200000].map { |w| [w, nil] }]
# First character of each keyword line is the kanji itself.
rtk = IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(2200).map { |l| l[0] }.join
learned = Dir["../data/words[012].txt"].map { |f| IO.read(f).split }.flatten
# MeCab lemmas (%f[6]) of morphemes whose character type is 2 (contain
# kanji), collected from all desktop .srt files.
words = `for f in ~/desktop/*.srt; do mecab -F '%t %f[6]\\n' "$f"; done | awk '$1=="2"{print $2}'`
output = ""

(words.split - learned).uniq.shuffle.each { |word|
  next unless freq.key?(word)
  # Word must consist of hiragana and/or known RTK kanji, length >= 2.
  next unless word =~ /^[\u{3040}-\u{309f}#{rtk}]{2,}$/
  reb, gloss = edict[word] || next
  # Only short, plain English glosses.
  next unless gloss =~ /^[a-z -]{1,18}$/
  output << "<div onclick=\"highlight(this)\"><div>#{furigana(word, reb)}</div><div>#{gloss}</div></div>\n"
}

exit if output == ""
# Replace the existing word divs in the review page and open it.
f = "../review/episodes.html"
IO.write(f, IO.read(f).sub(/<div.*<\/div>/m, output))
system("open", f)

This script generates an HTML file for reviewing uncommon words in subtitle files or other Japanese text.

edict-subs.rb

# Annotate Japanese .srt subtitles in place: after every uncommon word
# (frequency rank >= 10000) that has a short JMdict gloss, insert its
# English translation.

# Membership set of words from rank 10000 onwards.
# NOTE(review): splits on whitespace — if word-frequency.txt is
# ;-delimited as in the sibling scripts, keys keep their ;-suffix.
# Verify the file format.
freq = {}
IO.read("#{Dir.home}/Sites/jp/word-frequency.txt").split[10000..-1].each { |w|
  freq[w] = nil
}

# JMdict headword => short gloss (<= 20 chars); first entry wins.
edict = {}
IO.read("../data/JMdict_e.xml").scan(/<entry>.*?<\/entry>/m).each { |entry|
  keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
  next if edict[keb]
  next unless freq.key?(keb)
  gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
  # Strip a leading tag, a trailing parenthetical and the infinitive "to ".
  gloss = gloss.sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
  next if gloss.length > 20
  edict[keb] = gloss
}

Dir["#{Dir.home}/Desktop/*.srt"].each { |f|
  out = ""
  # srt blocks are separated by blank lines: id, timing, subtitle text.
  IO.read(f).gsub("\r", "").split("\n\n").each { |s|
    id, time, subs = s.split("\n", 3)
    out << id + "\n" + time + "\n"
    # MeCab rows: surface (%M) + lemma (%f[6]), tab-separated.
    IO.popen(["mecab", "-F%M\t%f[6]\n", "-U%M\n", "-E", "EOS\n"], "r+") { |io|
      io.puts subs
      io.close_write
      io.read
    }.split("\n").each { |morpheme|
      if morpheme == "EOS"
        out << "\n"
      elsif morpheme =~ /(.+)\t(.+)/
        # Uncommon word with a known gloss: emit " surface gloss ".
        if english = edict[$2]
          out << " " + $1 + " " + english + " "
        else
          out << $1
        end
      else
        out << morpheme
      end
    }
    out << "\n"
  }
  # Trim leading/trailing spaces on each line.  Must be the non-bang gsub:
  # gsub! returns nil when nothing matches, which would have passed nil to
  # IO.write and destroyed the subtitle file.
  IO.write(f, out.gsub(/^ | $/, ""))
}

This script modifies Japanese srt subtitles to add translations after uncommon words:

778
01:03:09,196 --> 01:03:13,200
上巻 first volume 下巻 last volume じゃなくて
上中下だって。

rtk

#!/usr/bin/env ruby

# Look up RTK keywords (comma- or newline-separated, from ARGV or stdin)
# and print the corresponding kanji.
table = {}
IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(3000).each do |line|
  kanji, keyword = line.split(";").values_at(0, 2)
  table[keyword] = kanji
end
keywords = (ARGV[0] || STDIN.read).split(/[,\n]/m)
puts keywords.map { |keyword| table[keyword] }.join

This script converts RTK keywords to kanji:

$ rtk Sino-,character
漢字

kanjidic-tsv.rb

I used this script to generate the first eight columns in kanji.txt.

require "nokogiri"

xml = IO.read("#{Dir.home}/japanese/data/kanjidic2.xml")

# Emit one tab-separated row per kanji: literal, on readings, kun readings,
# nanori, English meanings, grade, stroke count, classical radical.
Nokogiri.XML(xml).css("character").each do |character|
  fields = []
  fields << character.css("literal").text
  fields << character.css("reading[r_type='ja_on']").map(&:text).join(" ")
  fields << character.css("reading[r_type='ja_kun']").map(&:text).join(" ")
  fields << character.css("nanori").map(&:text).join(" ")
  fields << character.css("meaning:not([m_lang])").map(&:text).join(", ")
  fields << character.css("grade").text
  fields << character.css("stroke_count").text
  fields << character.css("rad_value[rad_type='classical']").text
  puts fields.join("\t")
end

rtktype

#!/usr/bin/env bash

# Kanji keyword drill: show a random RTK kanji centered in the terminal,
# read the typed keyword, log every repetition, and on exit print the
# kanji that were answered incorrectly.

cd "${0%/*}"
# NOTE(review): not exported, so it affects only this shell, not children.
LC_ALL=en_US.UTF-8
trap onexit EXIT
clear

# Print each kanji+keyword answered incorrectly (status x) this session.
onexit() {
  echo
  clear
  printf %s "$out" | while read l; do
    # Log line layout: "<epoch> <o|x> <kanji> <keyword>" — with a 10-digit
    # epoch, the status is at offset 11, the kanji at 13, keyword from 15.
    [[ ${l:11:1} = x ]] && printf %s "${l:13:1} ${l:15} "
  done
  printf '\b\n'
}

# keywords[] alternates kanji, keyword (fields 1 and 3 of the data file).
IFS=$';\n' read -d '' -a keywords < <(grep -v '^#' ~/Sites/jp/rtk-keywords.txt | head -n2200 | cut -d\; -f1,3)

while :; do
  let i++
  # $RANDOM$RANDOM concatenates two 15-bit values for a larger range.
  framenumber=$(($RANDOM$RANDOM%(${#keywords[@]}/2)))
  kanji=${keywords[$framenumber*2]}
  keyword=${keywords[$framenumber*2+1]}
  # Pad so the prompt sits roughly in the middle of the terminal.
  pad=$(printf %$(($(tput cols)/2-7))s)
  # Reading exactly ${#keyword} characters auto-submits the answer.
  read -ep "$pad$kanji " -n ${#keyword} answer
  if [[ $answer = $keyword ]]; then
    status=o
  else
    # Wrong: reveal the answer for two seconds.
    clear
    echo "$pad$kanji $keyword"
    sleep 2
    clear
    status=x
    # Discard any keystrokes typed during the reveal.
    read -d '' -t0.001 -n99999
    printf '\e[2K\r'
  fi
  repetition="$(date +%s) $status $kanji $keyword"
  echo "$repetition" >> repetitions.txt
  out+="$repetition"$'\n'
  # Stop after 50 repetitions; the EXIT trap prints the misses.
  [[ $i = 50 ]] && exit
done

vocabulary-two-kanji.rb

# Build freqtwoaudio.aif: for each unlearned two-kanji word (frequency
# ranks 30000..100000) that has podcast audio, append the word's audio,
# then its two RTK keywords + gloss spoken by macOS `say`, then the
# word's audio again.

# JMdict headword => [reading, gloss]; first entry wins.
edict = {}
IO.read("#{Dir.home}/japanese/data/JMdict_e").scan(/<entry>.*?<\/entry>/m).each { |entry|
  keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
  next if edict[keb]
  reb = entry[/(?<=<reb>).*(?=<\/reb>)/]
  gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
  # Strip a leading tag, a trailing parenthetical and the infinitive "to ".
  gloss = gloss.sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
  edict[keb] = [reb, gloss]
}

# kanji => RTK keyword for the first 2200 frames.
rtk = {}
IO.read("#{Dir.home}/Sites/jp/rtk-keywords.txt").split("\n").grep(/^[^#]/).take(2200).each { |l|
  c = l.split(";")
  rtk[c[0]] = c[2]
}
kanji = rtk.keys.join

# Membership set of already-learned words.
learned = {}
Dir["#{Dir.home}/japanese/data/words[012].txt"].each { |f|
  IO.foreach(f) { |w| learned[w.chomp] = nil }
}

# Numbered clips in /tmp/two so sox concatenates them in order.
Dir.mkdir("/tmp/two") unless Dir.exist?("/tmp/two")
i = 10000
IO.readlines("#{Dir.home}/Sites/jp/word-frequency.txt")[30000..100000].map { |l|
  l.split(";")[0]
}.each { |word|
  next unless word =~ /^[#{kanji}]{2}$/
  next if learned.key?(word)
  next unless edictword = edict[word]
  # Anchored with $ for a short plain-English gloss, consistent with the
  # sibling scripts (the original only constrained the gloss's prefix).
  next unless edictword[1] =~ /^[a-z '-]{1,20}$/
  pod = "#{Dir.home}/japanese/files/pod/#{word} #{edictword[0]}.mp3"
  next unless File.exist?(pod)
  `ffmpeg -v quiet -i '#{pod}' -ar 22050 -y /tmp/two/#{i += 1}\\ #{word}\\ #{edictword[0]}.aif`
  system("say", "[[volm 0.45]]#{rtk[word[0]]} #{rtk[word[1]]}[[slnc 200]]#{edictword[1]}", "-o", "/tmp/two/#{i += 1}.aif")
  `ffmpeg -v quiet -i '#{pod}' -ar 22050 -y /tmp/two/#{i += 1}.aif`
}

# Concatenate and clean up in /tmp/two — the clips were written there,
# but the original globbed /tmp/*.aif and would have found none of them.
`sox /tmp/two/*.aif ~/Japanese/review/freqtwoaudio.aif
rm /tmp/two/*.aif`

printable-homophones.rb

# Generate printable-homophones.html: groups of two-to-four two-kanji
# words that share a reading, each with a short unique gloss.

# JMdict headword => [reading, gloss]; first entry wins.
# NOTE(review): this hash is never used below (the word data comes from
# edict.txt instead) — possibly vestigial.
edict = {}
IO.read("#{Dir.home}/japanese/data/JMdict_e").scan(/<entry>.*?<\/entry>/m).each { |entry|
  keb = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
  next if edict[keb]
  reb = entry[/(?<=<reb>).*(?=<\/reb>)/]
  gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
  gloss = gloss.sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
  edict[keb] = [reb, gloss]
}

# Membership set for frequency ranks 10000..50000.
freq = {}
IO.foreach("#{Dir.home}/sites/jp/word-frequency.txt").grep(/^[^#]/)[10000..50000].each { |l|
  freq[l.split(";")[0]] = nil
}

# kanji => RTK keyword for the first 2200 frames (only the keys are used,
# to build the allowed-kanji character class below).
rtk = Hash[IO.foreach("#{Dir.home}/sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(2200).map { |l|
  l.split(";").values_at(0, 2)
}]

# [word, reading, gloss] triples from classic EDICT for words made of two
# known kanji, mid-frequency, with a short plain-English gloss.
words = []
IO.read("#{Dir.home}/japanese/data/edict.txt").scan(/^([#{rtk.keys.join}]{2}) \[(.*?)\] \/(.*?)\//).each { |w|
  next unless freq.key?(w[0])
  w[2] = w[2].sub(/^(\(.*?\) )*/, "").sub(/ \([^)]*?\)$/, "").sub(/^to /, "")
  next unless w[2] =~ /^[a-z -]{1,20}$/
  words << w
}

# All distinct readings.
kana = words.transpose[1].uniq
output = ""
# NOTE(review): `audio` is never written or read below — possibly vestigial.
audio = []

kana.shuffle.each { |k|
  # Words sharing this reading; keep only those whose gloss is unique
  # within the group, and pick at most four of them.
  found = words.select { |w| w[1] == k }
  found = found.select { |w| found.transpose[2].count(w[2]) == 1 }.sample(4)
  next unless found.size >= 2
  output << "<div><span>#{k}</span>\n"
  found.each { |f|
    output << "<div><div>#{f[0]}</div><div>#{f[2]}</div></div>\n"
  }
  output << "</div>\n"
}

exit if output == ""

# Replace the existing homophone divs in the page.
f = "#{Dir.home}/Sites/jp/printable-homophones.html"
IO.write(f, IO.read(f).sub(/<div.*\/div>\n/m, output))

printable-two-kanji.rb

# Print review rows (reading / word / gloss) for two-kanji words that
# appear in the frequency list (ranks 15000..40000) but not in Core 6000.

# JMdict headword => [reading, gloss]; first entry wins.
edict = {}
IO.read("#{Dir.home}/japanese/data/JMdict_e").scan(/<entry>.*?<\/entry>/m).each do |entry|
  headword = entry[/(?<=<keb>).*(?=<\/keb>)/] || next
  next if edict[headword]
  reading = entry[/(?<=<reb>).*(?=<\/reb>)/]
  gloss = entry[/(?<=<gloss>).*(?=<\/gloss>)/]
  # Strip a leading tag, a trailing parenthetical and the infinitive "to ".
  gloss = gloss.sub(/^\(.*?\) */, "").sub(/ \(.*?\)$/, "").sub(/^to /, "")
  edict[headword] = [reading, gloss]
end

# First character of each keyword line is the kanji itself.
rtk = IO.readlines("#{Dir.home}/Sites/jp/rtk-keywords.txt").grep(/^[^#]/).take(2200).map { |l| l[0] }.join
freq = IO.read("#{Dir.home}/Sites/jp/word-frequency.txt").scan(/^[^#;]+/)[15000..40000]
core = IO.read("#{Dir.home}/Sites/jp/core-6000.txt").scan(/^[^#;]+/)
output = ""

(freq - core).shuffle.each do |word|
  next unless word =~ /^[#{rtk}]{2}$/
  entry = edict[word]
  next unless entry
  reading, gloss = entry
  # Only short, plain English glosses.
  next unless gloss =~ /^[a-z -]{1,20}$/
  output << "<div><div>#{reading}</div><div>#{word}</div><div>#{gloss}</div></div>\n"
end

# Replace the existing divs in the page.
f = "#{Dir.home}/Sites/jp/printable-two-kanji.html"
IO.write(f, IO.read(f).sub(/<div.*<\/div>\n/m, output))