# JRuby script: builds a second-order (two-token prefix) Markov chain from
# selected lines of an EUC-JP encoded HTML file, tokenised with the Sen
# library's StringTagger, and prints one randomly generated sentence for
# every sentence-initial prefix that was observed.
include Java
require "open-uri"
require "nkf"
require "pp"

P_CLASS_PATH = "java.class.path"
P_SEN_HOME = "sen.home"
SEN_HOME = "/home/yuichiro/sen-1.2.2.1"

# The "sen.home" system property is set before the tagger is instantiated.
Java.java.lang.System.set_property(P_SEN_HOME, SEN_HOME)
cp = Java.java.lang.System.get_property(P_CLASS_PATH)
p cp  # debug output: the JVM class path currently in effect

# Disabled experiment: appending the Sen jars to java.class.path at runtime.
# sen_jars = Dir::glob(File::join(SEN_HOME, "lib", "*.jar")).join(File::PATH_SEPARATOR)
# p sen_jars
# Java.java.lang.System.set_property(P_CLASS_PATH, cp + File::PATH_SEPARATOR + sen_jars)
# p Java.java.lang.System.get_property(P_CLASS_PATH)

tagger = Java::net.java.sen.StringTagger.get_instance

# Read the whole input file, decoding it as EUC-JP through java.io, and
# normalise every line to "<stripped text>\n".
str = ""
begin
  r = java.io.BufferedReader.new(
    java.io.InputStreamReader.new(
      java.io.FileInputStream.new("/home/yuichiro/16.html"), "EUC-JP"))
  while (x = r.readLine)
    # strip already removes the trailing line terminator, so no chomp needed.
    str << x.strip << "\n"
  end
ensure
  r.close if r
end
puts str.size

# A pair of consecutive tokens used as the key of the Markov table.
#
# Instances are compared by value (both members must be eql?) so that two
# separately constructed prefixes with the same tokens address the same
# Hash entry.
class Pfx
  attr_accessor :fst, :snd

  # fst:: the older token of the pair ('BOS' at the start of a sentence)
  # snd:: the newer token of the pair
  def initialize(fst, snd)
    @fst = fst
    @snd = snd
  end

  # Hash of the ordered pair. Hashing the array keeps ("ab", "c") and
  # ("a", "bc") apart and does not allocate a concatenated string.
  def hash
    [@fst, @snd].hash
  end

  # Value equality as required by Hash; false for anything that is not a Pfx.
  def eql?(x)
    x.is_a?(Pfx) && x.fst.eql?(@fst) && x.snd.eql?(@snd)
  end
  alias == eql?
end

# prefix (Pfx) => list of tokens seen right after it. Duplicates are kept on
# purpose so that a uniform random pick is weighted by observed frequency.
dic = Hash.new { |h, key| h[key] = [] }

# For every line that starts with "3/16", take the line that follows it and
# feed its tokens into the table.
str.scan(/\n3\/16.*\n^(.*)$/) do |x|
  cur = ['BOS']
  toks = tagger.analyze(x[0])
  if toks
    toks.each do |t|
      surface = t.get_surface
      # The very first token only completes the ('BOS', token) prefix; from
      # the second token on, record it as a suffix of the last two tokens.
      if cur.size > 1
        cur = cur[-2, 2]
        dic[Pfx.new(*cur)].push(surface)
      end
      cur.push(surface)
    end
  end
  # Mark the end of the sentence after its final two-token prefix.
  dic[Pfx.new(*cur[-2, 2])].push("EOS") if cur.size > 1
end

# Generate one sentence per sentence-initial prefix. dic.keys is a snapshot,
# so the table is not iterated while it is being read below.
dic.keys.each do |pfx|
  next unless pfx && pfx.fst == 'BOS'
  print pfx.snd
  # fetch bypasses the default block: an unknown prefix ends the sentence
  # instead of inserting an empty entry and failing on a nil suffix.
  while (sfxes = dic.fetch(pfx, nil))
    break if sfxes.empty?
    sfx = sfxes[rand(sfxes.length)]
    break if sfx == "EOS"
    print sfx
    pfx = Pfx.new(pfx.snd, sfx)
  end
  puts
end