Great Circle Associates

XCIN Mail-list
(December 2000)


Indexed By Date: [Previous] [Next] Indexed By Thread: [Previous] [Next]

Subject: Re: about xcin and addtsi....
From: Kuang-che Wu <kcwu@camel.ck.tp.edu.tw>
Organization: Taipei Chien-kuo Senior High School
Date: 7 Dec 2000 20:46:47 GMT
To: xcin@tlug.sinica.edu.tw
Reply-To: xcin@linux.org.tw

thhsieh@linux.org.tw 提到:
> : 1.對於 match 不到詞的字, 希望能夠先上字頻比較高的字
> :   這字頻也先不用太講究, 可以先拿一般注音輸入法第一選字位的字來應急
之前我說錯了, 現在就是先上字頻比較高的字, 
應該是先上 "讀某音, 字頻最高的字"
我想到的作法是, 根據詞庫以及詞庫中填的注音
統計每個字各種唸法出現的頻率, 用這個頻率來上字

一個小問題是, 這功能是不是要放到 libtabe 中, 而不是只放在 bims 中
因為這看起來滿基本、常用的

> : 2.希望能夠手動強迫斷詞
> :   像自然注音那樣, 可以在輸入法認定詞的中間,
> :   按 tab 強迫把該詞的字視為不同一詞
> 這兩個都需要先修改 libtabe/libbims, 然後再配合修改 xcin,工程比較大,
> 我們會慢慢來弄。
希望沒有人正在寫斷詞的 code....
我稍微修改 bims, 使得可以手動強迫斷詞

除了加上 tsiboundary[] 記錄斷詞位置之外,
我還改了
 bimsContextDP() 常 realloc, 改成 malloc 一次
 稍微改進了一點效率(真的是一點點, 應該沒什麼差別....)

現在可以強迫斷詞了, 但剛寫好沒作很多的測試, 
因此最好再檢查一下....雖然我猜應該沒什麼大問題
還有, 因為我太久沒用自然輸入法了, 因此不太記得 tab 的行為了...
印象中好像 tab 多按幾次會把詞接起來??
我現在的作法是, 按 tab 會切換該位置是否要把詞切斷
沒有其他特殊的功能

以下是 patch
--- bims.c.old	Fri Dec  8 04:00:58 2000
+++ bims.c	Fri Dec  8 04:01:12 2000
@@ -198,6 +198,10 @@
 	free(fbc->pindown);
       }
       fbc->pindown = (ZhiCode *)NULL;
+      if (fbc->tsiboundary) {
+	free(fbc->tsiboundary);
+      }
+      fbc->tsiboundary = (int*)NULL;
       fbc->state = BC_STATE_EDITING;
       fbc->bcid = 0;
       memset(&(fbc->zc), 0, sizeof(fbc->zc));
@@ -439,12 +443,13 @@
   struct YinSegInfo *ysinfo = (struct YinSegInfo *)NULL;
   int num_ysinfo = 0;
   struct smart_com *comb = (struct smart_com *)NULL;
-  int i, j, k, rval;
+  int i, j, k, z, rval;
   int yinhead, len, ncomb = 0;
   int ncand, *cand;
   int *tmpcand, tmpncand;
   struct TsiYinInfo ty;
   struct TsiInfo tsi;
+  int maxcount;
   int max_int, index;
   double max_double;
 #define TMP_BUFFER 80 /* this should be far enough for this implementaion */
@@ -483,15 +488,17 @@
       ysinfo[num_ysinfo].yindata = (Yin *)malloc(sizeof(Yin)*2);
       memcpy(ysinfo[num_ysinfo].yindata, bc->yin+yinhead, sizeof(Yin)*2);
       num_ysinfo++;
-      /* done duplicate a two-character word */
-      memset(&ty, 0, sizeof(ty));
-      ty.yinlen = 2;
-      ty.yin = ysinfo[num_ysinfo-1].yindata;
-      rval = ydb->Get(ydb, &ty);
-      if (!rval) {
-	/* tsiyin exists, verify if it has pindown character */
- 	if (!bimsVerifyPindown(bc, &ty, yinhead, -1)) {
-	  break;
+      if(!bc->tsiboundary[yinhead+1]) {
+	/* done duplicate a two-character word */
+	memset(&ty, 0, sizeof(ty));
+	ty.yinlen = 2;
+	ty.yin = ysinfo[num_ysinfo-1].yindata;
+	rval = ydb->Get(ydb, &ty);
+	if (!rval) {
+	  /* tsiyin exists, verify if it has pindown character */
+ 	  if (!bimsVerifyPindown(bc, &ty, yinhead, -1)) {
+	    break;
+	  }
 	}
       }
       /* no such tsiyin, handle word by word */
@@ -506,6 +513,14 @@
       break;
     }
     for (i = len-yinhead; i > 0; i--) {
+      for(z = 1; z < i; z++) {
+        if(bc->tsiboundary[yinhead+z]) {
+          break;
+        }
+      }
+      if(z != i) {
+        continue;
+      }
       memset(&ty, 0, sizeof(ty));
       ty.yinlen = i;
       memcpy(yin, bc->yin+yinhead, sizeof(Yin)*i);
@@ -519,6 +534,14 @@
       }
       for (j = len-yinhead-i; j >= 0; j--) {
 	if (j > 0) {
+          for(z = 1; z < j; z++) {
+            if(bc->tsiboundary[yinhead+i+z]) {
+              break;
+            }
+          }
+          if(z != j) {
+            continue;
+          }
 	  memset(&ty, 0, sizeof(ty));
 	  ty.yinlen = j;
 	  memcpy(yin, bc->yin+yinhead+i, sizeof(Yin)*j);
@@ -532,7 +555,18 @@
 	  }
 	}
 	for (k = len-yinhead-i-j; k >= 0; k--) {
+	  if (k > 0 && j == 0) {
+	    continue;
+	  }
 	  if (k > 0) {
+            for(z = 1; z < k; z++) {
+              if(bc->tsiboundary[yinhead+i+j+z]) {
+                break;
+              }
+            }
+	    if(z != k) {
+	      continue;
+	    }
 	    memset(&ty, 0, sizeof(ty));
 	    ty.yinlen = k;
 	    memcpy(yin, bc->yin+yinhead+i+j, sizeof(Yin)*k);
@@ -545,9 +579,6 @@
 	      continue;
 	    }
 	  }
-	  if (k > 0 && j == 0) {
-	    continue;
-	  }
 	  comb = (struct smart_com *)
 	    realloc(comb, sizeof(struct smart_com)*(ncomb+1));
 	  comb[ncomb].s1 = yinhead;
@@ -565,17 +596,20 @@
     /* rule 1: largest sum of three-tsi */
     max_int = 0;
     index = 0;
+    maxcount = 0;
     for (i = 0; i < ncomb; i++) {
       if (comb[i].len > max_int) {
 	index = i;
 	max_int = comb[i].len;
+	maxcount = 1;
+      } else if(comb[i].len == max_int) {
+        maxcount++;
       }
     }
     ncand = 0;
-    cand = (int *)NULL;
+    cand = (int *)malloc(sizeof(int)*maxcount);
     for (i = 0; i < ncomb; i++) {
       if (comb[i].len == max_int) {
-	cand = (int *)realloc(cand, sizeof(int)*(ncand+1));
 	cand[ncand] = i;
 	ncand++;
       }
@@ -588,6 +622,7 @@
     else { /* ambiguity */
       /* rule 2: largest average word length */
       max_double = 0;
+      maxcount = 0;
       for (i = 0; i < ncand; i++) {
 	index = cand[i];
 	comb[index].avg_word_len = 0;
@@ -611,15 +646,17 @@
 	comb[index].avg_word_len /= j;
 	if (comb[index].avg_word_len > max_double) {
 	  max_double = comb[index].avg_word_len;
+	  maxcount = 1;
+	} else if(comb[index].avg_word_len == max_double) {
+	  maxcount++;
 	}
       }
 
       tmpncand = 0;
-      tmpcand = (int *)NULL;
+      tmpcand = (int *)malloc(sizeof(int)*maxcount);
       for (i = 0; i < ncand; i++) {
 	index = cand[i];
 	if (comb[index].avg_word_len == max_double) {
-	  tmpcand = (int *)realloc(tmpcand, sizeof(int)*(tmpncand+1));
 	  tmpcand[tmpncand] = index;
 	  tmpncand++;
 	}
@@ -637,6 +674,7 @@
       else { /* ambiguity */
 	/* rule 3: smallest variance of word length */
 	max_double = 1000; /* this is misleading */
+	maxcount = 0;
 	for (i = 0; i < ncand; i++) {
 	  index = cand[i];
 	  comb[index].smallest_var = 0;
@@ -654,15 +692,17 @@
 	  comb[index].smallest_var /= 3;
 	  if (comb[index].smallest_var < max_double) {
 	    max_double = comb[index].smallest_var;
+	    maxcount = 1;
+	  } else if(comb[index].smallest_var == max_double) {
+	    maxcount++;
 	  }
 	}
 	
 	tmpncand = 0;
-	tmpcand = (int *)NULL;
+	tmpcand = (int *)malloc(sizeof(int)*maxcount);
 	for (i = 0; i < ncand; i++) {
 	  index = cand[i];
 	  if (comb[index].smallest_var == max_double) {
-	    tmpcand = (int *)realloc(tmpcand, sizeof(int)*(tmpncand+1));
 	    tmpcand[tmpncand] = index;
 	    tmpncand++;
 	  }
@@ -681,6 +721,7 @@
 	  int max_ref;
 	  /* rule 4: largest sum of tsi ref count */
 	  max_double = 0;
+	  maxcount = 0;
 	  for (i = 0; i < ncand; i++) {
 	    index = cand[i];
 	    comb[index].largest_sum = 0;
@@ -750,15 +791,17 @@
 
 	    if (comb[index].largest_sum > max_double) {
 	      max_double = comb[index].largest_sum;
+	      maxcount = 1;
+	    } else if(comb[index].largest_sum == max_double) {
+	      maxcount++;
 	    }
 	  }
 	  
 	  tmpncand = 0;
-	  tmpcand = (int *)NULL;
+	  tmpcand = (int *)malloc(sizeof(int)*maxcount);
 	  for (i = 0; i < ncand; i++) {
 	    index = cand[i];
 	    if (comb[index].largest_sum == max_double) {
-	      tmpcand = (int *)realloc(tmpcand, sizeof(int)*(tmpncand+1));
 	      tmpcand[tmpncand] = index;
 	      tmpncand++;
 	    }
@@ -926,6 +969,7 @@
  * XK_Right     move the internal cursor one zhi right
  * XK_Backspace
  * XK_Delete    delete the zhi in front of the internal cursor
+ * XK_Tab       switch the tsi boundary
  * XK_Return    does nothing so far, client may request for string
  * others       depends on the key mapping the client uses
  *
@@ -1006,6 +1050,9 @@
 	  memmove(bc->pindown+(bc->yinpos-1),
 		  bc->pindown+(bc->yinpos),
 		  sizeof(ZhiCode)*(bc->yinlen-(bc->yinpos-1)));
+	  memmove(bc->tsiboundary+(bc->yinpos-1),
+	  	  bc->tsiboundary+(bc->yinpos),
+	  	  sizeof(int)*(bc->yinlen-(bc->yinpos-1)));
 	}
 	else { /* the last character */
 	  bc->internal_text[(bc->yinlen-1)*2] = (unsigned char)NULL;
@@ -1017,6 +1064,14 @@
       }
     }
     return(BC_VAL_IGNORE);
+  case XK_Tab:
+    if(strlen((char *)bc->zc.string) == 0 && bc->yinlen >0 &&
+       bc->yinlen!=bc->yinpos) {
+      bc->tsiboundary[bc->yinpos]=!bc->tsiboundary[bc->yinpos];
+      bimsContextSmartEdit(bc);
+      return(BC_VAL_ABSORB);
+    }
+    return(BC_VAL_IGNORE);
   case XK_Return:
     return(BC_VAL_ABSORB);
   default:
@@ -1091,6 +1146,11 @@
       memmove(bc->pindown+(bc->yinpos+1), bc->pindown+(bc->yinpos),
 	      sizeof(ZhiCode)*(bc->yinlen-bc->yinpos));
       bc->pindown[bc->yinpos] = 0;
+      bc->tsiboundary = (int *)realloc(bc->tsiboundary,
+				       sizeof(int)*(bc->yinlen+1));
+      memmove(bc->tsiboundary+(bc->yinpos+1), bc->tsiboundary+(bc->yinpos),
+	      sizeof(int)*(bc->yinlen-bc->yinpos));
+      bc->tsiboundary[bc->yinpos] = 0;
       bc->yinlen++;
       bc->yinpos++;
       bimsZuYinContextClear(&(bc->zc));
@@ -1311,9 +1371,14 @@
   for (;*str;) {
     bc->pindown[i] =
       (*str)*256 + *(str+1);
+    bc->tsiboundary[i] = 0;
     i++;
     str += 2;
   }
+  if(i != bc->yinlen)
+    bc->tsiboundary[i] = 1;
+  if(bc->yinpos != 0)
+    bc->tsiboundary[bc->yinpos] = 1;
   bimsContextSmartEdit(bc);
 
   return(0);
@@ -1369,6 +1434,8 @@
   memmove(bc->internal_text, bc->internal_text+newlen*2,
 	  sizeof(unsigned char)*((bc->yinlen-newlen)*2+1));
   memmove(bc->pindown, bc->pindown+len, sizeof(ZhiCode)*(bc->yinlen-newlen));
+  memmove(bc->tsiboundary, bc->tsiboundary+len,
+	  sizeof(int)*(bc->yinlen-newlen));
   bc->yinlen -= newlen;
 
   bimsContextSmartEdit(bc);
--- bims.h.old	Fri Dec  8 04:01:06 2000
+++ bims.h	Fri Dec  8 04:01:12 2000
@@ -78,6 +78,8 @@
   unsigned char       *internal_text;  /* text: internal text           */
   ZhiCode             *pindown;        /* flag: indicating that the Zhi
 				                is pinned down          */
+  int                 *tsiboundary;    /* flag: indicating that the Tsi
+						boundary		*/
   int                  state;          /* editing or zhi selection mode */
   unsigned long int    bcid;           /* bimsContext Identifier        */
   int                  keymap;         /* the type of keymap it uses    */
To Unsubscribe: send mail to majordomo@linux.org.tw
with "unsubscribe xcin" in the body of the message


Follow-Ups:
Indexed By Date Previous: [注音填補]4001-4500
From: ACES.bbs@openbazaar.net (小黃)
Next: Re: about xcin and addtsi....
From: Chih-Hao Tsai <hao520@yahoo.com>
Indexed By Thread Previous: [注音填補]4001-4500
From: ACES.bbs@openbazaar.net (小黃)
Next: Re: about xcin and addtsi....
From: Chih-Hao Tsai <hao520@yahoo.com>