Class: TinyJapaneseSegmenter

Inherits:
Object
  • Object
show all
Defined in:
lib/tiny_japanese_segmenter.rb

Overview

Ruby port of chasen.org/~taku/software/TinySegmenter/tiny_segmenter-0.2.js. This is essentially a trained machine learning model used to segment Japanese text into words. Discourse core uses it for “best effort” segmentation of Japanese text for search.

Constant Summary collapse

# Ordered list of [Regexp, type code] pairs used to classify a single character:
#   "M" kanji numerals, "H" kanji, "I" hiragana, "K" katakana,
#   "A" Latin letters, "N" digits.
# The numeral class is listed before the general kanji class because the
# kanji range also contains those numerals (presumably the table is scanned
# first-match-wins by the classifier, which is not shown here).
#
# Full-width and half-width variants are written as \u escapes so that
# Unicode compatibility normalisation of this file cannot silently fold them
# into their ASCII / full-width equivalents:
#   \uFF71-\uFF9D  half-width katakana A..N
#   \uFF9E         half-width voiced sound mark
#   \uFF70         half-width prolonged sound mark
#   \uFF41-\uFF5A  full-width a..z      \uFF21-\uFF3A  full-width A..Z
#   \uFF10-\uFF19  full-width 0..9
CHARTYPE =
{
  "[一二三四五六七八九十百千万億兆]" => "M",
  "[一-龠々〆ヵヶ]" => "H",
  "[ぁ-ん]" => "I",
  "[ァ-ヴー\uFF71-\uFF9D\uFF9E\uFF70]" => "K",
  "[a-zA-Z\uFF41-\uFF5A\uFF21-\uFF3A]" => "A",
  "[0-9\uFF10-\uFF19]" => "N",
}.map { |pattern, value| [Regexp.compile(pattern), value] }.freeze
BIAS =
-322
BC1 =
{ "HH" => 6, "II" => 2461, "KH" => 406, "OH" => -1378 }
BC2 =
{
  "AA" => -3267,
  "AI" => 2744,
  "AN" => -878,
  "HH" => -4070,
  "HM" => -1711,
  "HN" => 4012,
  "HO" => 3761,
  "IA" => 1327,
  "IH" => -1184,
  "II" => -1332,
  "IK" => 1721,
  "IO" => 5492,
  "KI" => 3831,
  "KK" => -8741,
  "MH" => -3132,
  "MK" => 3334,
  "OO" => -2920,
}
BC3 =
{
  "HH" => 996,
  "HI" => 626,
  "HK" => -721,
  "HN" => -1307,
  "HO" => -836,
  "IH" => -301,
  "KK" => 2762,
  "MK" => 1079,
  "MM" => 4034,
  "OA" => -1652,
  "OH" => 266,
}
BP1 =
{ "BB" => 295, "OB" => 304, "OO" => -125, "UB" => 352 }
BP2 =
{ "BO" => 60, "OO" => -1762 }
BQ1 =
{
  "BHH" => 1150,
  "BHM" => 1521,
  "BII" => -1158,
  "BIM" => 886,
  "BMH" => 1208,
  "BNH" => 449,
  "BOH" => -91,
  "BOO" => -2597,
  "OHI" => 451,
  "OIH" => -296,
  "OKA" => 1851,
  "OKH" => -1020,
  "OKK" => 904,
  "OOO" => 2965,
}
BQ2 =
{
  "BHH" => 118,
  "BHI" => -1159,
  "BHM" => 466,
  "BIH" => -919,
  "BKK" => -1720,
  "BKO" => 864,
  "OHH" => -1139,
  "OHM" => -181,
  "OIH" => 153,
  "UHI" => -1146,
}
BQ3 =
{
  "BHH" => -792,
  "BHI" => 2664,
  "BII" => -299,
  "BKI" => 419,
  "BMH" => 937,
  "BMM" => 8335,
  "BNN" => 998,
  "BOH" => 775,
  "OHH" => 2174,
  "OHM" => 439,
  "OII" => 280,
  "OKH" => 1798,
  "OKI" => -793,
  "OKO" => -2242,
  "OMH" => -2402,
  "OOO" => 11_699,
}
BQ4 =
{
  "BHH" => -3895,
  "BIH" => 3761,
  "BII" => -4654,
  "BIK" => 1348,
  "BKK" => -1806,
  "BMI" => -3385,
  "BOO" => -12_396,
  "OAH" => 926,
  "OHH" => 266,
  "OHK" => -2036,
  "ONN" => -973,
}
BW1 =
{
  ",と" => 660,
  ",同" => 727,
  "B1あ" => 1404,
  "B1同" => 542,
  "、と" => 660,
  "、同" => 727,
  "」と" => 1682,
  "あっ" => 1505,
  "いう" => 1743,
  "いっ" => -2055,
  "いる" => 672,
  "うし" => -4817,
  "うん" => 665,
  "から" => 3472,
  "がら" => 600,
  "こう" => -790,
  "こと" => 2083,
  "こん" => -1262,
  "さら" => -4143,
  "さん" => 4573,
  "した" => 2641,
  "して" => 1104,
  "すで" => -3399,
  "そこ" => 1977,
  "それ" => -871,
  "たち" => 1122,
  "ため" => 601,
  "った" => 3463,
  "つい" => -802,
  "てい" => 805,
  "てき" => 1249,
  "でき" => 1127,
  "です" => 3445,
  "では" => 844,
  "とい" => -4915,
  "とみ" => 1922,
  "どこ" => 3887,
  "ない" => 5713,
  "なっ" => 3015,
  "など" => 7379,
  "なん" => -1113,
  "にし" => 2468,
  "には" => 1498,
  "にも" => 1671,
  "に対" => -912,
  "の一" => -501,
  "の中" => 741,
  "ませ" => 2448,
  "まで" => 1711,
  "まま" => 2600,
  "まる" => -2155,
  "やむ" => -1947,
  "よっ" => -2565,
  "れた" => 2369,
  "れで" => -913,
  "をし" => 1860,
  "を見" => 731,
  "亡く" => -1886,
  "京都" => 2558,
  "取り" => -2784,
  "大き" => -2604,
  "大阪" => 1497,
  "平方" => -2314,
  "引き" => -1336,
  "日本" => -195,
  "本当" => -2423,
  "毎日" => -2113,
  "目指" => -724,
  "B1あ" => 1404,
  "B1同" => 542,
  "」と" => 1682,
}
BW2 =
{
  ".." => -11_822,
  "11" => -669,
  "――" => -5730,
  "−−" => -13_175,
  "いう" => -1609,
  "うか" => 2490,
  "かし" => -1350,
  "かも" => -602,
  "から" => -7194,
  "かれ" => 4612,
  "がい" => 853,
  "がら" => -3198,
  "きた" => 1941,
  "くな" => -1597,
  "こと" => -8392,
  "この" => -4193,
  "させ" => 4533,
  "され" => 13_168,
  "さん" => -3977,
  "しい" => -1819,
  "しか" => -545,
  "した" => 5078,
  "して" => 972,
  "しな" => 939,
  "その" => -3744,
  "たい" => -1253,
  "たた" => -662,
  "ただ" => -3857,
  "たち" => -786,
  "たと" => 1224,
  "たは" => -939,
  "った" => 4589,
  "って" => 1647,
  "っと" => -2094,
  "てい" => 6144,
  "てき" => 3640,
  "てく" => 2551,
  "ては" => -3110,
  "ても" => -3065,
  "でい" => 2666,
  "でき" => -1528,
  "でし" => -3828,
  "です" => -4761,
  "でも" => -4203,
  "とい" => 1890,
  "とこ" => -1746,
  "とと" => -2279,
  "との" => 720,
  "とみ" => 5168,
  "とも" => -3941,
  "ない" => -2488,
  "なが" => -1313,
  "など" => -6509,
  "なの" => 2614,
  "なん" => 3099,
  "にお" => -1615,
  "にし" => 2748,
  "にな" => 2454,
  "によ" => -7236,
  "に対" => -14_943,
  "に従" => -4688,
  "に関" => -11_388,
  "のか" => 2093,
  "ので" => -7059,
  "のに" => -6041,
  "のの" => -6125,
  "はい" => 1073,
  "はが" => -1033,
  "はず" => -2532,
  "ばれ" => 1813,
  "まし" => -1316,
  "まで" => -6621,
  "まれ" => 5409,
  "めて" => -3153,
  "もい" => 2230,
  "もの" => -10_713,
  "らか" => -944,
  "らし" => -1611,
  "らに" => -1897,
  "りし" => 651,
  "りま" => 1620,
  "れた" => 4270,
  "れて" => 849,
  "れば" => 4114,
  "ろう" => 6067,
  "われ" => 7901,
  "を通" => -11_877,
  "んだ" => 728,
  "んな" => -4115,
  "一人" => 602,
  "一方" => -1375,
  "一日" => 970,
  "一部" => -1051,
  "上が" => -4479,
  "会社" => -1116,
  "出て" => 2163,
  "分の" => -7758,
  "同党" => 970,
  "同日" => -913,
  "大阪" => -2471,
  "委員" => -1250,
  "少な" => -1050,
  "年度" => -8669,
  "年間" => -1626,
  "府県" => -2363,
  "手権" => -1982,
  "新聞" => -4066,
  "日新" => -722,
  "日本" => -7068,
  "日米" => 3372,
  "曜日" => -601,
  "朝鮮" => -2355,
  "本人" => -2697,
  "東京" => -1543,
  "然と" => -1384,
  "社会" => -1276,
  "立て" => -990,
  "第に" => -1612,
  "米国" => -4268,
  "11" => -669,
}
BW3 =
{
  "あた" => -2194,
  "あり" => 719,
  "ある" => 3846,
  "い." => -1185,
  "い。" => -1185,
  "いい" => 5308,
  "いえ" => 2079,
  "いく" => 3029,
  "いた" => 2056,
  "いっ" => 1883,
  "いる" => 5600,
  "いわ" => 1527,
  "うち" => 1117,
  "うと" => 4798,
  "えと" => 1454,
  "か." => 2857,
  "か。" => 2857,
  "かけ" => -743,
  "かっ" => -4098,
  "かに" => -669,
  "から" => 6520,
  "かり" => -2670,
  "が," => 1816,
  "が、" => 1816,
  "がき" => -4855,
  "がけ" => -1127,
  "がっ" => -913,
  "がら" => -4977,
  "がり" => -2064,
  "きた" => 1645,
  "けど" => 1374,
  "こと" => 7397,
  "この" => 1542,
  "ころ" => -2757,
  "さい" => -714,
  "さを" => 976,
  "し," => 1557,
  "し、" => 1557,
  "しい" => -3714,
  "した" => 3562,
  "して" => 1449,
  "しな" => 2608,
  "しま" => 1200,
  "す." => -1310,
  "す。" => -1310,
  "する" => 6521,
  "ず," => 3426,
  "ず、" => 3426,
  "ずに" => 841,
  "そう" => 428,
  "た." => 8875,
  "た。" => 8875,
  "たい" => -594,
  "たの" => 812,
  "たり" => -1183,
  "たる" => -853,
  "だ." => 4098,
  "だ。" => 4098,
  "だっ" => 1004,
  "った" => -4748,
  "って" => 300,
  "てい" => 6240,
  "てお" => 855,
  "ても" => 302,
  "です" => 1437,
  "でに" => -1482,
  "では" => 2295,
  "とう" => -1387,
  "とし" => 2266,
  "との" => 541,
  "とも" => -3543,
  "どう" => 4664,
  "ない" => 1796,
  "なく" => -903,
  "など" => 2135,
  "に," => -1021,
  "に、" => -1021,
  "にし" => 1771,
  "にな" => 1906,
  "には" => 2644,
  "の," => -724,
  "の、" => -724,
  "の子" => -1000,
  "は," => 1337,
  "は、" => 1337,
  "べき" => 2181,
  "まし" => 1113,
  "ます" => 6943,
  "まっ" => -1549,
  "まで" => 6154,
  "まれ" => -793,
  "らし" => 1479,
  "られ" => 6820,
  "るる" => 3818,
  "れ," => 854,
  "れ、" => 854,
  "れた" => 1850,
  "れて" => 1375,
  "れば" => -3246,
  "れる" => 1091,
  "われ" => -605,
  "んだ" => 606,
  "んで" => 798,
  "カ月" => 990,
  "会議" => 860,
  "入り" => 1232,
  "大会" => 2217,
  "始め" => 1681,
  "" => 965,
  "新聞" => -5055,
  "日," => 974,
  "日、" => 974,
  "社会" => 2024,
  "カ月" => 990,
}
TC1 =
{
  "AAA" => 1093,
  "HHH" => 1029,
  "HHM" => 580,
  "HII" => 998,
  "HOH" => -390,
  "HOM" => -331,
  "IHI" => 1169,
  "IOH" => -142,
  "IOI" => -1015,
  "IOM" => 467,
  "MMH" => 187,
  "OOI" => -1832,
}
TC2 =
{
  "HHO" => 2088,
  "HII" => -1023,
  "HMM" => -1154,
  "IHI" => -1965,
  "KKH" => 703,
  "OII" => -2649,
}
TC3 =
{
  "AAA" => -294,
  "HHH" => 346,
  "HHI" => -341,
  "HII" => -1088,
  "HIK" => 731,
  "HOH" => -1486,
  "IHH" => 128,
  "IHI" => -3041,
  "IHO" => -1935,
  "IIH" => -825,
  "IIM" => -1035,
  "IOI" => -542,
  "KHH" => -1216,
  "KKA" => 491,
  "KKH" => -1217,
  "KOK" => -1009,
  "MHH" => -2694,
  "MHM" => -457,
  "MHO" => 123,
  "MMH" => -471,
  "NNH" => -1689,
  "NNO" => 662,
  "OHO" => -3393,
}
TC4 =
{
  "HHH" => -203,
  "HHI" => 1344,
  "HHK" => 365,
  "HHM" => -122,
  "HHN" => 182,
  "HHO" => 669,
  "HIH" => 804,
  "HII" => 679,
  "HOH" => 446,
  "IHH" => 695,
  "IHO" => -2324,
  "IIH" => 321,
  "III" => 1497,
  "IIO" => 656,
  "IOO" => 54,
  "KAK" => 4845,
  "KKA" => 3386,
  "KKK" => 3065,
  "MHH" => -405,
  "MHI" => 201,
  "MMH" => -241,
  "MMM" => 661,
  "MOM" => 841,
}
TQ1 =
{
  "BHHH" => -227,
  "BHHI" => 316,
  "BHIH" => -132,
  "BIHH" => 60,
  "BIII" => 1595,
  "BNHH" => -744,
  "BOHH" => 225,
  "BOOO" => -908,
  "OAKK" => 482,
  "OHHH" => 281,
  "OHIH" => 249,
  "OIHI" => 200,
  "OIIH" => -68,
}
TQ2 =
{ "BIHH" => -1401, "BIII" => -1033, "BKAK" => -543, "BOOO" => -5591 }
TQ3 =
{
  "BHHH" => 478,
  "BHHM" => -1073,
  "BHIH" => 222,
  "BHII" => -504,
  "BIIH" => -116,
  "BIII" => -105,
  "BMHI" => -863,
  "BMHM" => -464,
  "BOMH" => 620,
  "OHHH" => 346,
  "OHHI" => 1729,
  "OHII" => 997,
  "OHMH" => 481,
  "OIHH" => 623,
  "OIIH" => 1344,
  "OKAK" => 2792,
  "OKHH" => 587,
  "OKKA" => 679,
  "OOHH" => 110,
  "OOII" => -685,
}
TQ4 =
{
  "BHHH" => -721,
  "BHHM" => -3604,
  "BHII" => -966,
  "BIIH" => -607,
  "BIII" => -2181,
  "OAAA" => -2763,
  "OAKK" => 180,
  "OHHH" => -294,
  "OHHI" => 2446,
  "OHHO" => 480,
  "OHIH" => -1573,
  "OIHH" => 1935,
  "OIHI" => -493,
  "OIIH" => 626,
  "OIII" => -4007,
  "OKAK" => -8156,
}
TW1 =
{ "につい" => -4681, "東京都" => 2026 }
TW2 =
{
  "ある程" => -2049,
  "いった" => -1256,
  "ころが" => -2434,
  "しょう" => 3873,
  "その後" => -4430,
  "だって" => -1049,
  "ていた" => 1833,
  "として" => -4657,
  "ともに" => -4517,
  "もので" => 1882,
  "一気に" => -792,
  "初めて" => -1512,
  "同時に" => -8097,
  "大きな" => -1255,
  "対して" => -2721,
  "社会党" => -3216,
}
TW3 =
{
  "いただ" => -1734,
  "してい" => 1314,
  "として" => -4314,
  "につい" => -5483,
  "にとっ" => -5989,
  "に当た" => -6247,
  "ので," => -727,
  "ので、" => -727,
  "のもの" => -600,
  "れから" => -3752,
  "十二月" => -2287,
}
TW4 =
{
  "いう." => 8576,
  "いう。" => 8576,
  "からな" => -2348,
  "してい" => 2958,
  "たが," => 1516,
  "たが、" => 1516,
  "ている" => 1538,
  "という" => 1349,
  "ました" => 5543,
  "ません" => 1097,
  "ようと" => -4258,
  "よると" => 5865,
}
UC1 =
{ "A" => 484, "K" => 93, "M" => 645, "O" => -505 }
UC2 =
{ "A" => 819, "H" => 1059, "I" => 409, "M" => 3987, "N" => 5775, "O" => 646 }
UC3 =
{ "A" => -1370, "I" => 2311 }
UC4 =
{
  "A" => -2643,
  "H" => 1809,
  "I" => -1032,
  "K" => -3450,
  "M" => 3565,
  "N" => 3876,
  "O" => 6646,
}
UC5 =
{ "H" => 313, "I" => -1238, "K" => -799, "M" => 539, "O" => -831 }
UC6 =
{ "H" => -506, "I" => -253, "K" => 87, "M" => 247, "O" => -387 }
UP1 =
{ "O" => -214 }
UP2 =
{ "B" => 69, "O" => 935 }
UP3 =
{ "B" => 189 }
UQ1 =
{
  "BH" => 21,
  "BI" => -12,
  "BK" => -99,
  "BN" => 142,
  "BO" => -56,
  "OH" => -95,
  "OI" => 477,
  "OK" => 410,
  "OO" => -2422,
}
UQ2 =
{ "BH" => 216, "BI" => 113, "OK" => 1759 }
UQ3 =
{
  "BA" => -479,
  "BH" => 42,
  "BI" => 1913,
  "BK" => -7198,
  "BM" => 3160,
  "BN" => 6427,
  "BO" => 14_761,
  "OI" => -827,
  "ON" => -3212,
}
UW1 =
{
  "," => 156,
  "" => 156,
  "" => -463,
  "" => -941,
  "" => -127,
  "" => -553,
  "" => 121,
  "" => 505,
  "" => -201,
  "" => -547,
  "" => -123,
  "" => -789,
  "" => -185,
  "" => -847,
  "" => -466,
  "" => -470,
  "" => 182,
  "" => -292,
  "" => 208,
  "" => 169,
  "" => -446,
  "" => -137,
  "" => -135,
  "" => -402,
  "" => -268,
  "" => -912,
  "" => 871,
  "" => -460,
  "" => 561,
  "" => 729,
  "" => -411,
  "" => -141,
  "" => 361,
  "" => -408,
  "" => -386,
  "" => -718,
  "" => -463,
  "" => -135,
}
UW2 =
{
  "," => -829,
  "" => -829,
  "" => 892,
  "" => -645,
  "" => 3145,
  "" => -538,
  "" => 505,
  "" => 134,
  "" => -502,
  "" => 1454,
  "" => -856,
  "" => -412,
  "" => 1141,
  "" => 878,
  "" => 540,
  "" => 1529,
  "" => -675,
  "" => 300,
  "" => -1011,
  "" => 188,
  "" => 1837,
  "" => -949,
  "" => -291,
  "" => -268,
  "" => -981,
  "" => 1273,
  "" => 1063,
  "" => -1764,
  "" => 130,
  "" => -409,
  "" => -1273,
  "" => 1261,
  "" => 600,
  "" => -1263,
  "" => -402,
  "" => 1639,
  "" => -579,
  "" => -694,
  "" => 571,
  "" => -2516,
  "" => 2095,
  "" => -587,
  "" => 306,
  "" => 568,
  "" => 831,
  "" => -758,
  "" => -2150,
  "" => -302,
  "" => -968,
  "" => -861,
  "" => 492,
  "" => -123,
  "" => 978,
  "" => 362,
  "" => 548,
  "" => -3025,
  "" => -1566,
  "" => -3414,
  "" => -422,
  "" => -1769,
  "" => -865,
  "" => -483,
  "" => -1519,
  "" => 760,
  "" => 1023,
  "" => -2009,
  "" => -813,
  "" => -1060,
  "" => 1067,
  "" => -1519,
  "" => -1033,
  "" => 1522,
  "" => -1355,
  "" => -1682,
  "" => -1815,
  "" => -1462,
  "" => -630,
  "" => -1843,
  "" => -1650,
  "" => -931,
  "" => -665,
  "" => -2378,
  "" => -180,
  "" => -1740,
  "" => 752,
  "" => 529,
  "" => -1584,
  "" => -242,
  "" => -1165,
  "" => -763,
  "" => 810,
  "" => 509,
  "" => -1353,
  "" => 838,
  "西" => -744,
  "" => -3874,
  "調" => 1010,
  "" => 1198,
  "" => 3041,
  "" => 1758,
  "" => -1257,
  "" => -645,
  "" => 3145,
  "" => 831,
  "" => -587,
  "" => 306,
  "" => 568,
}
UW3 =
{
  "," => 4889,
  "1" => -800,
  "" => -1723,
  "" => 4889,
  "" => -2311,
  "" => 5827,
  "" => 2670,
  "" => -3573,
  "" => -2696,
  "" => 1006,
  "" => 2342,
  "" => 1983,
  "" => -4864,
  "" => -1163,
  "" => 3271,
  "" => 1004,
  "" => 388,
  "" => 401,
  "" => -3552,
  "" => -3116,
  "" => -1058,
  "" => -395,
  "" => 584,
  "" => 3685,
  "" => -5228,
  "" => 842,
  "" => -521,
  "" => -1444,
  "" => -1081,
  "" => 6167,
  "" => 2318,
  "" => 1691,
  "" => -899,
  "" => -2788,
  "" => 2745,
  "" => 4056,
  "" => 4555,
  "" => -2171,
  "" => -1798,
  "" => 1199,
  "" => -5516,
  "" => -4384,
  "" => -120,
  "" => 1205,
  "" => 2323,
  "" => -788,
  "" => -202,
  "" => 727,
  "" => 649,
  "" => 5905,
  "" => 2773,
  "" => -1207,
  "" => 6620,
  "" => -518,
  "" => 551,
  "" => 1319,
  "" => 874,
  "" => -1350,
  "" => 521,
  "" => 1109,
  "" => 1591,
  "" => 2201,
  "" => 278,
  "" => -3794,
  "" => -1619,
  "" => -1759,
  "" => -2087,
  "" => 3815,
  "" => 653,
  "" => -758,
  "" => -1193,
  "" => 974,
  "" => 2742,
  "" => 792,
  "" => 1889,
  "" => -1368,
  "" => 811,
  "" => 4265,
  "" => -361,
  "" => -2439,
  "" => 4858,
  "" => 3593,
  "" => 1574,
  "" => -3030,
  "" => 755,
  "" => -1880,
  "" => 5807,
  "" => 3095,
  "" => 457,
  "" => 2475,
  "" => 1129,
  "" => 2286,
  "" => 4437,
  "" => 365,
  "" => -949,
  "" => -1872,
  "" => 1327,
  "" => -1038,
  "" => 4646,
  "" => -2309,
  "" => -783,
  "" => -1006,
  "" => 483,
  "" => 1233,
  "" => 3588,
  "" => -241,
  "" => 3906,
  "" => -837,
  "" => 4513,
  "" => 642,
  "" => 1389,
  "" => 1219,
  "" => -241,
  "" => 2016,
  "" => -1356,
  "" => -423,
  "" => -1008,
  "" => 1078,
  "" => -513,
  "" => -3102,
  "" => 1155,
  "" => 3197,
  "" => -1804,
  "" => 2416,
  "" => -1030,
  "" => 1605,
  "" => 1452,
  "" => -2352,
  "" => -3885,
  "" => 1905,
  "" => -1291,
  "" => 1822,
  "" => -488,
  "" => -3973,
  "" => -2013,
  "" => -1479,
  "" => 3222,
  "" => -1489,
  "" => 1764,
  "" => 2099,
  "" => 5792,
  "" => -661,
  "" => -1248,
  "" => -951,
  "" => -937,
  "" => 4125,
  "" => 360,
  "" => 3094,
  "" => 364,
  "" => -805,
  "" => 5156,
  "" => 2438,
  "" => 484,
  "" => 2613,
  "" => -1694,
  "" => -1073,
  "" => 1868,
  "" => -495,
  "" => 979,
  "" => 461,
  "" => -3850,
  "" => -273,
  "" => 914,
  "" => 1215,
  "" => 7313,
  "" => -1835,
  "" => 792,
  "" => 6293,
  "" => -1528,
  "" => 4231,
  "" => 401,
  "" => -960,
  "" => 1201,
  "" => 7767,
  "" => 3066,
  "" => 3663,
  "" => 1384,
  "" => -4229,
  "" => 1163,
  "" => 1255,
  "" => 6457,
  "" => 725,
  "" => -2869,
  "" => 785,
  "" => 1044,
  "調" => -562,
  "" => -733,
  "" => 1777,
  "" => 1835,
  "" => 1375,
  "" => -1504,
  "" => -1136,
  "" => -681,
  "" => 1026,
  "" => 4404,
  "" => 1200,
  "" => 2163,
  "" => 421,
  "" => -1432,
  "" => 1302,
  "" => -1282,
  "" => 2009,
  "" => -1045,
  "" => 2066,
  "" => 1620,
  "" => -800,
  "" => 2670,
  "" => -3794,
  "" => -1350,
  "" => 551,
  "グ" => 1319,
  "" => 874,
  "" => 521,
  "" => 1109,
  "" => 1591,
  "" => 2201,
  "" => 278,
}
UW4 =
{
  "," => 3930,
  "." => 3508,
  "" => -4841,
  "" => 3930,
  "" => 3508,
  "" => 4999,
  "" => 1895,
  "" => 3798,
  "" => -5156,
  "" => 4752,
  "" => -3435,
  "" => -640,
  "" => -2514,
  "" => 2405,
  "" => 530,
  "" => 6006,
  "" => -4482,
  "" => -3821,
  "" => -3788,
  "" => -4376,
  "" => -4734,
  "" => 2255,
  "" => 1979,
  "" => 2864,
  "" => -843,
  "" => -2506,
  "" => -731,
  "" => 1251,
  "" => 181,
  "" => 4091,
  "" => 5034,
  "" => 5408,
  "" => -3654,
  "" => -5882,
  "" => -1659,
  "" => 3994,
  "" => 7410,
  "" => 4547,
  "" => 5433,
  "" => 6499,
  "" => 1853,
  "" => 1413,
  "" => 7396,
  "" => 8578,
  "" => 1940,
  "" => 4249,
  "" => -4134,
  "" => 1345,
  "" => 6665,
  "" => -744,
  "" => 1464,
  "" => 1051,
  "" => -2082,
  "" => -882,
  "" => -5046,
  "" => 4169,
  "" => -2666,
  "" => 2795,
  "" => -1544,
  "" => 3351,
  "" => -2922,
  "" => -9726,
  "" => -14_896,
  "" => -2613,
  "" => -4570,
  "" => -1783,
  "" => 13_150,
  "" => -2352,
  "" => 2145,
  "" => 1789,
  "" => 1287,
  "" => -724,
  "" => -403,
  "" => -1635,
  "" => -881,
  "" => -541,
  "" => -856,
  "" => -3637,
  "" => -4371,
  "" => -11_870,
  "" => -2069,
  "" => 2210,
  "" => 782,
  "" => -190,
  "" => -1768,
  "" => 1036,
  "" => 544,
  "" => 950,
  "" => -1286,
  "" => 530,
  "" => 4292,
  "" => 601,
  "" => -2006,
  "" => -1212,
  "" => 584,
  "" => 788,
  "" => 1347,
  "" => 1623,
  "" => 3879,
  "" => -302,
  "" => -740,
  "" => -2715,
  "" => 776,
  "" => 4517,
  "" => 1013,
  "" => 1555,
  "" => -1834,
  "" => -681,
  "" => -910,
  "" => -851,
  "" => 1500,
  "" => -619,
  "" => -1200,
  "" => 866,
  "" => -1410,
  "" => -2094,
  "" => -1413,
  "" => 1067,
  "" => 571,
  "" => -4802,
  "" => -1397,
  "" => -1057,
  "" => -809,
  "" => 1910,
  "" => -1328,
  "" => -1500,
  "" => -2056,
  "" => -2667,
  "" => 2771,
  "" => 374,
  "" => -4556,
  "" => 456,
  "" => 553,
  "" => 916,
  "" => -1566,
  "" => 856,
  "" => 787,
  "" => 2182,
  "" => 704,
  "" => 522,
  "" => -856,
  "" => 1798,
  "" => 1829,
  "" => 845,
  "" => -9066,
  "" => -485,
  "" => -442,
  "" => -360,
  "" => -1043,
  "" => 5388,
  "" => -2716,
  "" => -910,
  "" => -939,
  "" => -543,
  "" => -735,
  "" => 672,
  "" => -1267,
  "" => -1286,
  "" => -1101,
  "" => -2900,
  "" => 1826,
  "" => 2586,
  "" => 922,
  "" => -3485,
  "" => 2997,
  "" => -867,
  "" => -2112,
  "" => 788,
  "" => 2937,
  "" => 786,
  "" => 2171,
  "" => 1146,
  "" => -1169,
  "" => 940,
  "" => -994,
  "" => 749,
  "" => 2145,
  "" => -730,
  "" => -852,
  "" => -792,
  "" => 792,
  "" => -1184,
  "" => -244,
  "" => -1000,
  "" => 730,
  "" => -1481,
  "" => 1158,
  "" => -1433,
  "" => -3370,
  "" => 929,
  "" => -1291,
  "" => 2596,
  "" => -4866,
  "" => 1192,
  "" => -1100,
  "" => -2213,
  "" => 357,
  "" => -2344,
  "" => -2297,
  "" => -2604,
  "" => -878,
  "" => -1659,
  "" => -792,
  "" => -1984,
  "" => 1749,
  "" => 2120,
  "" => 1895,
  "" => 3798,
  "" => -4371,
  "" => -724,
  "" => -11_870,
  "" => 2145,
  "" => 1789,
  "" => 1287,
  "" => -403,
  "" => -1635,
  "" => -881,
  "" => -541,
  "" => -856,
  "" => -3637,
}
UW5 =
{
  "," => 465,
  "." => -299,
  "1" => -514,
  "E2" => -32_768,
  "]" => -2762,
  "" => 465,
  "" => -299,
  "" => 363,
  "" => 1655,
  "" => 331,
  "" => -503,
  "" => 1199,
  "" => 527,
  "" => 647,
  "" => -421,
  "" => 1624,
  "" => 1971,
  "" => 312,
  "" => -983,
  "" => -1537,
  "" => -1371,
  "" => -852,
  "" => -1186,
  "" => 1093,
  "" => 52,
  "" => 921,
  "" => -18,
  "" => -850,
  "" => -127,
  "" => 1682,
  "" => -787,
  "" => -1224,
  "" => -635,
  "" => -578,
  "" => 1001,
  "" => 502,
  "" => 865,
  "" => 3350,
  "" => 854,
  "" => -208,
  "" => 429,
  "" => 504,
  "" => 419,
  "" => -1264,
  "" => 327,
  "" => 241,
  "" => 451,
  "" => -343,
  "" => -871,
  "" => 722,
  "" => -1153,
  "" => -654,
  "" => 3519,
  "" => -901,
  "" => 848,
  "" => 2104,
  "" => -1296,
  "" => -548,
  "" => 1785,
  "" => -1304,
  "" => -2991,
  "" => 921,
  "" => 1763,
  "" => 872,
  "" => -814,
  "" => 1618,
  "" => -1682,
  "" => 218,
  "" => -4353,
  "" => 932,
  "" => 1356,
  "" => -1508,
  "" => -1347,
  "" => 240,
  "" => -3912,
  "" => -3149,
  "" => 1319,
  "" => -1052,
  "" => -4003,
  "" => -997,
  "" => -278,
  "" => -813,
  "" => 1955,
  "" => -2233,
  "" => 663,
  "" => -1073,
  "" => 1219,
  "" => -1018,
  "" => -368,
  "" => 786,
  "" => 1191,
  "" => 2368,
  "" => -689,
  "" => -514,
  "E2" => -32_768,
  "" => 363,
  "" => 241,
  "" => 451,
  "" => -343,
}
UW6 =
{
  "," => 227,
  "." => 808,
  "1" => -270,
  "E1" => 306,
  "" => 227,
  "" => 808,
  "" => -307,
  "" => 189,
  "" => 241,
  "" => -73,
  "" => -121,
  "" => -200,
  "" => 1782,
  "" => 383,
  "" => -428,
  "" => 573,
  "" => -1014,
  "" => 101,
  "" => -105,
  "" => -253,
  "" => -149,
  "" => -417,
  "" => -236,
  "" => -206,
  "" => 187,
  "" => -135,
  "" => 195,
  "" => -673,
  "" => -496,
  "" => -277,
  "" => 201,
  "" => -800,
  "" => 624,
  "" => 302,
  "" => 1792,
  "" => -1212,
  "" => 798,
  "" => -960,
  "" => 887,
  "" => -695,
  "" => 535,
  "" => -697,
  "" => 753,
  "" => -507,
  "" => 974,
  "" => -822,
  "" => 1811,
  "" => 463,
  "" => 1082,
  "" => -270,
  "E1" => 306,
  "" => -673,
  "" => -496,
}

Class Method Summary collapse

Class Method Details

.segment(text) ⇒ Object



1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
# File 'lib/tiny_japanese_segmenter.rb', line 1385

# Splits +text+ into word-like pieces.
#
# The text is walked one character at a time. For each position (from the
# second character on) a score is built from BIAS plus the weights found in
# the feature tables for the surrounding six-slot window of characters, their
# character types (from +ctype+) and the three previous boundary decisions.
# A positive score closes the current piece before that character.
#
# @param text [String, nil] text to segment
# @return [Array<String>] the pieces in input order; +[]+ when +text+ is nil
#   or contains only whitespace
def segment(text)
  return [] if text.nil? || text.strip.empty?

  chars = text.chars

  # Three sentinel slots on each side give every real character a full window.
  segments = %w[B3 B2 B1] + chars + %w[E1 E2 E3]
  ctypes = %w[O O O] + chars.map { |char| ctype(char) } + %w[O O O]

  pieces = []
  # Non-mutating `+=` is used below on purpose: `current` starts out as the
  # very same String object as segments[3], which later windows still read.
  current = segments[3]

  # Decisions taken at the three preceding positions ("B" = boundary placed,
  # "O" = none); "U" is the initial value before any decision has been made.
  prev1 = prev2 = prev3 = "U"

  (4..segments.size - 4).each do |idx|
    w1, w2, w3, w4, w5, w6 = segments[idx - 3, 6]
    c1, c2, c3, c4, c5, c6 = ctypes[idx - 3, 6]

    weights = [
      # previous decisions
      UP1[prev1], UP2[prev2], UP3[prev3],
      BP1[prev1 + prev2], BP2[prev2 + prev3],
      # characters: single, pairs, triples
      UW1[w1], UW2[w2], UW3[w3], UW4[w4], UW5[w5], UW6[w6],
      BW1[w2 + w3], BW2[w3 + w4], BW3[w4 + w5],
      TW1[w1 + w2 + w3], TW2[w2 + w3 + w4], TW3[w3 + w4 + w5], TW4[w4 + w5 + w6],
      # character types: single, pairs, triples
      # (a TC5 lookup on c4 + c5 + c6 was disabled in the original code)
      UC1[c1], UC2[c2], UC3[c3], UC4[c4], UC5[c5], UC6[c6],
      BC1[c2 + c3], BC2[c3 + c4], BC3[c4 + c5],
      TC1[c1 + c2 + c3], TC2[c2 + c3 + c4], TC3[c3 + c4 + c5], TC4[c4 + c5 + c6],
      # previous decisions combined with character types
      UQ1[prev1 + c1], UQ2[prev2 + c2], UQ3[prev3 + c3],
      BQ1[prev2 + c2 + c3], BQ2[prev2 + c3 + c4],
      BQ3[prev3 + c2 + c3], BQ4[prev3 + c3 + c4],
      TQ1[prev2 + c1 + c2 + c3], TQ2[prev2 + c2 + c3 + c4],
      TQ3[prev3 + c1 + c2 + c3], TQ4[prev3 + c2 + c3 + c4],
    ]

    # Missing table entries are nil; to_i turns them into 0.
    score = BIAS + weights.sum(&:to_i)

    if score > 0
      pieces << current
      current = ""
      decision = "B"
    else
      decision = "O"
    end

    prev1, prev2, prev3 = prev2, prev3, decision
    current += w4
  end

  pieces << current
end