検索の編集履歴ソース - ソフトウェアテスト勉強室 wiki

メニュー

更新履歴

取得中です。

カウンター

検索 - (2007/04/03 (火) 01:46:53) のソース

*構造体
 /*
  * One occurrence (posting) of a bigram term.
  *
  *   id  - in-document ID; the design notes budget 2 bytes (0..65535).
  *   pos - position inside the document; the design notes budget
  *         1 byte (0..127).
  *
  * `id` is unsigned short so the field actually fits the 2-byte budget;
  * unsigned int is 4 bytes on common platforms, which contradicted the
  * "3 bytes" size originally noted for this record.
  *
  * Payload: 3 bytes.  sizeof(bigram_pos_t) may still be larger (typically
  * 4) because the compiler is free to add trailing alignment padding.
  *
  * NOTE(review): the tag starts with an underscore, which C reserves at
  * file scope; it is kept unchanged so existing references still compile.
  */
 typedef struct _bigram_pos_t bigram_pos_t;
 struct _bigram_pos_t {
     unsigned short     id;   /* in-document ID, 0..65535 */
     unsigned char      pos;  /* position, 0..127 per the design notes */
 };
 
 /*
  * Index entry for one bigram: the term bytes followed by a fixed array
  * of eight postings.
  *
  * The entry is at least 8 + 8 * sizeof(bigram_pos_t) bytes; the 32-byte
  * figure from the original note holds only when bigram_pos_t occupies
  * exactly 3 bytes.
  */
 typedef struct _bigram_term_t {
     unsigned char      word[8];  /* term bytes (the 8-byte "word" column) */
     bigram_pos_t       pos[8];   /* postings recorded for this term */
 } bigram_term_t;
 
 /*
  * A run described by a start value and a length.
  *
  * NOTE(review): presumably `start` is the first in-document ID that
  * belongs to a document and `len` is the number of consecutive IDs in
  * the run (cf. the "document ID -> in-document IDs" table in the design
  * notes) -- confirm against the code that fills this in.
  *
  * `start` is unsigned short so the payload is the 3 bytes originally
  * noted for this record; unsigned int is 4 bytes on common platforms.
  * sizeof(bigram_docid_t) may still be larger (typically 4) because the
  * compiler is free to add trailing alignment padding.
  */
 typedef struct _bigram_docid_t bigram_docid_t;
 struct _bigram_docid_t {
     unsigned short     start;  /* first value of the run, 0..65535 */
     unsigned char      len;    /* length of the run, 0..255 */
 };



 文書(doc):
     - 文書ID
     - 著者
     - 題名
     - 作成日
 
 分かち文字(term)
     - 単語(word)              (8バイト)  unsigned char[8]
     - 文書内ID(id)            (2バイト)  unsigned short     (0～65535)
     - 文書内位置(pos)         (1バイト)  unsigned char      (0～127)
 
 0 日本語の検索をする
 1 全文検索する
 2 日本語の探索をする
 
 word(8)   (1+1) (1+1)
 ------------------------------------------------------
 日本      0 0   2 0
 本語      0 1   2 1
 語の      0 2   2 2
 の検      0 3
 検索      0 4   1 2
 索を      0 5   2 5
 をす      0 6   2 6
 する      0 7   1 4   2 7
 全文      1 0
 文検      1 1
 索す      1 3
 の探      2 3
 探索      2 4
 
 iword(4)  hash  hash ...
 ------------------------------------------------------
 の        h(検) h(探)
 索        h(を) h(す)
 ...
 
 文書ID    単語数  文書内ID群
 ------------------------------------------------------
 0         100     0
 1         380     1,2,3
 2         300     4,5
 3         90      6
 ...