OSDN Git Service

modified src/strokes.txt
[kp123/kp123.git] / src / recognize_kanji.c
index 29e959b..c8fe363 100644 (file)
@@ -69,32 +69,51 @@ gint LevenshteinDistance(gint l1, gchar *s1, gint l2, gchar *s2)
     return ret;
 }
 
-static kanji_result* rate_next_kanji(GList *strokes, gchar **sdata, gunichar2 *entry)
+static gchar* merge_strokes(gchar **sdata, gint *l1) /* Concatenate every string of sdata into one new buffer; total byte count is stored in *l1. */
 {
-    kanji_result *res = calloc(1, sizeof(kanji_result));
-    res->uc = *++entry;
-    gunichar2 *bakptr = entry;
-    gint i, j, l, l1 = 0, l2 = 0;
+    gint i, j, l;
+    *l1 = 0; /* running total of the lengths of all strings in sdata */
     for(i = 0; i < g_strv_length(sdata); i++)
-       l1 += strlen(sdata[i]);
-    gchar *s1 = calloc(l1, sizeof(gchar));
+       *l1 += strlen(sdata[i]);
+    gchar *s1 = calloc(*l1, sizeof(gchar)); /* exactly *l1 bytes: the result carries no NUL terminator, use *l1 as its length */
     for(i = 0, j = 0; i < g_strv_length(sdata); i++)
     {
        l = strlen(sdata[i]);
        g_memmove(&(s1[j]), sdata[i], l);
        j += l;
     }
-    for(l2 = 0; g_ascii_isalpha((gchar)*++entry); l2++);
-    gchar *s2 = calloc(l2, sizeof(gchar));
-    entry = bakptr;
-    for(i = 0; i < l2; s2[i++] = (gchar)(*++entry));
+    return s1; /* heap buffer that the caller must release (rate_next_kanji passes it to g_free) */
+}
+
+/*
+ * Copy the run of ASCII letters that starts at entry[1] into a newly
+ * allocated byte buffer (entry[0] itself is skipped).
+ *
+ * entry: UTF-16 code units; scanning stops at the first unit that is not
+ *        an ASCII letter.
+ * l2:    receives the number of letters copied.
+ *
+ * Returns a heap buffer holding the letters followed by one zero byte, so
+ * it can be used either with *l2 or as a C string.  The caller releases it.
+ * If the allocation fails, NULL is returned and *l2 is set to 0.
+ */
+static gchar* unichar_to_char(gunichar2 *entry, gint *l2)
+{
+    gint n = 0, i;
+    /* Test the range before narrowing: a plain (gchar) cast would turn a
+     * non-ASCII unit such as U+0141 into 'A' and count it as a letter. */
+    while(entry[n + 1] < 0x80 && g_ascii_isalpha((gchar)entry[n + 1]))
+       n++;
+    /* One extra zeroed byte keeps the buffer NUL-terminated and avoids a
+     * zero-sized allocation when no letters follow. */
+    gchar *s2 = calloc((size_t)n + 1, sizeof(gchar));
+    if(!s2)
+    {
+       *l2 = 0;
+       return NULL;
+    }
+    for(i = 0; i < n; i++)
+       s2[i] = (gchar)entry[i + 1];
+    *l2 = n;
+    return s2;
+}
+
+/*
+ * Score one dictionary entry against the strokes the user has drawn.
+ *
+ * strokes: raw stroke list, only forwarded to pass_extra_filters().
+ * sdata:   NULL-terminated array of per-stroke strings for the drawing.
+ * entry:   points at the entry's first code unit, a letter encoding its
+ *          stroke count ('A' is 1); the next unit is the kanji itself,
+ *          followed by the entry's stroke letters and optionally '|'.
+ *
+ * Returns a newly allocated kanji_result whose uc is the kanji and whose
+ * dist is the Levenshtein distance between the merged sdata and the
+ * entry's letters, plus the pass_extra_filters() result when the entry has
+ * a '|' section and the drawing has at least as many strokes as the entry.
+ */
+static kanji_result* rate_next_kanji(GList *strokes, gchar **sdata, gunichar2 *entry)
+{
+    kanji_result *res = calloc(1, sizeof(kanji_result));
+    gint n1 = g_strv_length(sdata);
+    gint n2 = *entry - 'A' + 1;
+    res->uc = *++entry;
+    gint l1, l2;
+    gchar *s1 = merge_strokes(sdata, &l1);
+    gchar *s2 = unichar_to_char(entry, &l2);
     res->dist += LevenshteinDistance(l1, s1, l2, s2);
     g_free(s1);
     g_free(s2);
+    /* Step over the l2 stroke letters.  Arithmetic on a gunichar2 pointer
+     * is already in code units, and s2 has been freed, so the saved length
+     * is used rather than strlen(s2) * sizeof(gunichar2). */
+    entry += l2;
     entry++;
     if(*entry == '|')
     {
-       res->dist += pass_extra_filters(strokes, entry);
+       if(n1 >= n2)
+           res->dist += pass_extra_filters(strokes, entry);
     }
     return res;
 }
@@ -109,13 +128,13 @@ static gint kanji_results_compare(gpointer *ptr1, gpointer *ptr2)
     return 0;
 }
 
-static gunichar2* find_next_entry(gchar *allkanji, gunichar2 *entry, gint allkanjilen, gunichar2 key)
+static gunichar2* find_next_entry(gchar *allkanji, gunichar2 *entry, gint allkanjilen, gunichar2 key1, gunichar2 key2)
 {
     if(allkanji == (gchar*)entry)
     {
        ++entry;
-       if(*entry != key)
-           return find_next_entry(allkanji, entry, allkanjilen, key);
+       if(*entry != key1)
+           return find_next_entry(allkanji, entry, allkanjilen, key1, key2);
     }
     else
     {
@@ -126,10 +145,10 @@ static gunichar2* find_next_entry(gchar *allkanji, gunichar2 *entry, gint allkan
            if(*++entry == '\n')
            {
                entry++;
-               if(*entry == key)
-                   break;
-               if(*entry > key)
+               if(*entry > key2)
                    return 0;
+               if(*entry >= key1)
+                   break;
                ++entry;
            }
        }
@@ -139,14 +158,17 @@ static gunichar2* find_next_entry(gchar *allkanji, gunichar2 *entry, gint allkan
 
 static gunichar2* pick_kanji(GList *strokes, gchar **sdata, gchar *allkanji, gint allkanjilen)
 {
-    const gint MAX_DISTANCE = 5;
+    const gint MAX_COUNT = 25;
     gint datalen = g_strv_length(sdata), i;
+    gint delta = 1 + datalen/8;
     gunichar2 key = 'A' + datalen - 1;
+    gunichar2 key1 = key - delta, key2 = key + delta;
+    if(key1 < 'A') key1 = 'A';
     gunichar2 *entry = (gunichar2*)allkanji;
     if(key > 'Z')
        return 0;
 
-    entry = find_next_entry(allkanji, entry, allkanjilen, key);
+    entry = find_next_entry(allkanji, entry, allkanjilen, key1, key2);
     if(!entry)
        return 0;
     GPtrArray *arr = g_ptr_array_new();
@@ -155,18 +177,23 @@ static gunichar2* pick_kanji(GList *strokes, gchar **sdata, gchar *allkanji, gin
     {
        kanji_result *res = rate_next_kanji(strokes, sdata, entry);
        g_ptr_array_add(arr, res);
-       g_ptr_array_sort(arr, (GCompareFunc)kanji_results_compare);
-       for(i = arr->len - 1; i >= 0; i--)
+       entry = find_next_entry(allkanji, entry, allkanjilen, key1, key2);
+       if(!entry)
+           break;
+    }
+    g_ptr_array_sort(arr, (GCompareFunc)kanji_results_compare);
+    if(arr->len > MAX_COUNT)
+    {
+       kanji_result *res = g_ptr_array_index(arr, MAX_COUNT-1);
+       gint max_dist = res->dist;
+       for(i = arr->len - 1; i >= MAX_COUNT; i--)
        {
            kanji_result *res = g_ptr_array_index(arr, i);
-           if(res->dist > MAX_DISTANCE)
+           if(res->dist > max_dist)
                g_ptr_array_remove_index(arr, i);
            else
                break;
        }
-       entry = find_next_entry(allkanji, entry, allkanjilen, key);
-       if(!entry)
-           break;
     }
     gunichar2 *ret = calloc(arr->len + 1, sizeof(gunichar2));
     for(i = 0; i < arr->len; i++)