

m4 >= 1.4.13

autoconf >= 2.65

automake >= 1.11

libtool >= 2.2.6b

coreseek安装需要预装的软件:apt-get install make gcc g++ automake libtool mysql-client libmysqlclient15-dev libxml2-dev libexpat1-dev



$ cd coreseek-3.2.14


$ locale






$ cat testpack/var/test/test.xml


$ cd mmseg-3.2.14

##ubuntu环境下,需要使用ACLOCAL_FLAGS="-I/usr/share/aclocal" ./bootstrap

$ ./bootstrap

$ ./configure --prefix=/usr/local/mmseg3

$ make && make install



$ /usr/local/mmseg3/bin/mmseg -d /usr/local/mmseg3/etc src/t1.txt

中文/x 分/x 词/x 测试/x

中国人/x 上海市/x

Word Splite took: 1 ms.


$ cd csft-3.2.14


$ sh buildconf.sh

$ ./configure --prefix=/usr/local/coreseek --without-python --without-unixodbc --with-mmseg --with-mmseg-includes=/usr/local/mmseg3/include/mmseg/ --with-mmseg-libs=/usr/local/mmseg3/lib/ --without-mysql

$ make && make install


$ /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/sphinx-min.conf.dist


Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]

Copyright (c) 2007-2010,

Beijing Choice Software Technologies Inc (http://www.coreseek.com)

using config file '/usr/local/coreseek/etc/sphinx-min.conf.dist'...

total 0 reads, 0.000 sec, 0.0 kb/call avg, 0.0 msec/call avg

total 0 writes, 0.000 sec, 0.0 kb/call avg, 0.0 msec/call avg







mysql依赖库:apt-get install make mysql-client libmysqlclient15-dev libxml2-dev libexpat1-dev


$ cd csft-3.2.14

$ make clean

$ ./configure --prefix=/usr/local/coreseek --without-unixodbc --with-mmseg --with-mmseg-includes=/usr/local/mmseg3/include/mmseg/ --with-mmseg-libs=/usr/local/mmseg3/lib/ --with-mysql


$ make && make install


$ cd testpack

$ /usr/local/coreseek/bin/indexer -c etc/csft.conf


Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]

Copyright (c) 2007-2010,

Beijing Choice Software Technologies Inc (http://www.coreseek.com)

using config file 'etc/csft.conf'...

total 0 reads, 0.000 sec, 0.0 kb/call avg, 0.0 msec/call avg

total 0 writes, 0.000 sec, 0.0 kb/call avg, 0.0 msec/call avg


##csft-4.0版显示:ERROR: nothing to do.


$ /usr/local/coreseek/bin/indexer -c etc/csft.conf --all


Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]

Copyright (c) 2007-2010,

Beijing Choice Software Technologies Inc (http://www.coreseek.com)

using config file 'etc/csft.conf'...

indexing index 'xml'...

collected 3 docs, 0.0 MB

sorted 0.0 Mhits, 100.0% done

total 3 docs, 7585 bytes

total 0.075 sec, 101043 bytes/sec, 39.96 docs/sec

total 2 reads, 0.000 sec, 5.6 kb/call avg, 0.0 msec/call avg

total 7 writes, 0.000 sec, 3.9 kb/call avg, 0.0 msec/call avg

$ /usr/local/coreseek/bin/indexer -c etc/csft.conf xml


Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]

Copyright (c) 2007-2010,

Beijing Choice Software Technologies Inc (http://www.coreseek.com)

using config file 'etc/csft.conf'...

indexing index 'xml'...

collected 3 docs, 0.0 MB

sorted 0.0 Mhits, 100.0% done

total 3 docs, 7585 bytes

total 0.069 sec, 109614 bytes/sec, 43.35 docs/sec

total 2 reads, 0.000 sec, 5.6 kb/call avg, 0.0 msec/call avg

total 7 writes, 0.000 sec, 3.9 kb/call avg, 0.0 msec/call avg

$ /usr/local/coreseek/bin/search -c etc/csft.conf


Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]

Copyright (c) 2007-2010,

Beijing Choice Software Technologies Inc (http://www.coreseek.com)

using config file 'etc/csft.conf'...

index 'xml': query '': returned 3 matches of 3 total in 0.093 sec

displaying matches:

1. document=1, weight=1, published=Thu Apr  1 22:20:07 2010, author_id=1

2. document=2, weight=1, published=Thu Apr  1 23:25:48 2010, author_id=1

3. document=3, weight=1, published=Thu Apr  1 12:01:00 2010, author_id=2


$ /usr/local/coreseek/bin/search -c etc/csft.conf -a Twittter和Opera都提供了搜索服务


Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]

Copyright (c) 2007-2010,

Beijing Choice Software Technologies Inc (http://www.coreseek.com)

using config file 'etc/csft.conf'...

index 'xml': query 'Twittter和Opera都提供了搜索服务 ': returned 3 matches of 3 total in 0.038 sec

displaying matches:

1. document=3, weight=24, published=Thu Apr  1 12:01:00 2010, author_id=2

2. document=1, weight=4, published=Thu Apr  1 22:20:07 2010, author_id=1

3. document=2, weight=3, published=Thu Apr  1 23:25:48 2010, author_id=1


1. 'twittter': 1 documents, 3 hits

2. '和': 3 documents, 15 hits

3. 'opera': 1 documents, 25 hits

4. '都': 2 documents, 4 hits

5. '提供': 0 documents, 0 hits

6. '了': 3 documents, 18 hits

7. '搜索': 2 documents, 5 hits

8. '服务': 1 documents, 1 hits

$ /usr/local/coreseek/bin/searchd -c etc/csft.conf


Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]

Copyright (c) 2007-2010,

Beijing Choice Software Technologies Inc (http://www.coreseek.com)

using config file 'etc/csft.conf'...

listening on all interfaces, port=9312

##如要停止搜索服务,请使用 /usr/local/coreseek/bin/searchd -c etc/csft.conf --stop

##如已启动服务,要更新索引,请使用 /usr/local/coreseek/bin/indexer -c etc/csft.conf --all --rotate








MySQL4.1起可以通过SET NAMES UTF8设定输出字符集为UTF-8,即使原始数据为GBK也可(Latin1不能直接使用,需先转换为UTF-8或者GBK字符集);




SET character_set_client = utf8 ;

SET character_set_connection = utf8 ;

SET character_set_database = utf8 ;

SET character_set_results = utf8 ;

SET character_set_server = utf8 ;

SET collation_connection = utf8_general_ci ;

SET collation_database = utf8_general_ci ;

SET collation_server = utf8_general_ci ;



# Minimal Sphinx configuration sample(clean, simple, functional)


source example


type                                      = mysql

sql_host                               = localhost

sql_user                               = root

sql_pass                              = root

sql_db                                  = nca

sql_port                               = 3306      # optional, default is 3306

sql_query_pre                          = SET NAMES utf8

sql_query_pre = REPLACE INTO sph_counter SELECT 1, MAX(id) FROM documents

sql_query = SELECT id,summary FROM documents WHERE id<=( SELECT max_doc_id FROM sph_counter WHERE counter_id=1 )





sql_query_info                            = SELECT * FROM documents WHERE id=$id


source delta : example (增量索引,用于更新)


sql_query_pre = SET NAMES utf8


sql_query = SELECT id,summary FROM documents WHERE id>( SELECT max_doc_id FROM sph_counter WHERE counter_id=1 )


index example


source                                           =example

path                                      =/usr/local/coreseek/var/data/example/

docinfo                                          =extern

charset_dictpath = /usr/local/mmseg3/etc/ #BSD、Linux环境下设置,/符号结尾

charset_type                      = zh_cn.utf-8


index delta : example


source = delta

path = /usr/local/coreseek/var/data/delta/

docinfo                                          =extern

charset_dictpath = /usr/local/mmseg3/etc/ #BSD、Linux环境下设置,/符号结尾

charset_type                      = zh_cn.utf-8




mem_limit                                    = 128M




port                                       = 9312

log                                                  =/usr/local/coreseek/var/log/searchd.log

query_log                                     =/usr/local/coreseek/var/log/query.log

read_timeout                     = 5

max_children                     = 60

pid_file                                 =/usr/local/coreseek/var/log/searchd.pid

max_matches                             = 1000

seamless_rotate                        = 1

preopen_indexes                        = 0

unlink_old                                     = 1


3) 开启searchd服务


先在 /usr/local/src/coreseek/var/log下创建一个searchd_example.pid文件,然后执行下面的代码:

/usr/local/coreseek/bin/searchd --config /usr/local/coreseek/etc/example.conf


/usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/example.conf --all


/usr/local/coreseek/bin/search -c /usr/local/coreseek/etc/example.conf --phrase 南京 --limit 2

6) 实时索引更新(合并索引->更新sph_counter中maxid值->清空增量索引)

开启searchd:/usr/local/coreseek/bin/searchd --config /usr/local/coreseek/etc/example.conf

1. 先建立一张增量索引记录表

CREATE TABLE sph_counter







3. ####第一次启动建立全索引

/usr/local/coreseek/bin/indexer -c /etc/example.conf --all

###启动searchd 后台模式,启动前一定先建立好全索引,不然启动失败或增量索引部分会索引不到

/usr/local/coreseek/bin/searchd -c /etc/example.conf  ###启动

/usr/local/coreseek/bin/searchd -c /etc/example.conf --stop  ###停止


/usr/local/coreseek/bin/indexer -c /etc/example.conf delta --rotate


/usr/local/coreseek/bin/indexer -c etc/example.conf --merge example delta --rotate  #合并索引

/usr/local/mysql/bin/mysql -hlocalhost -uroot -proot -Dnca -e 'replace into sph_counter select 1,max(id) from documents;' #更新sph_counter中maxid值

/usr/local/coreseek/bin/indexer -c /etc/example.conf delta --rotate #清空增量索引





$cd /usr/local/coreseek/etc/

$vi delta.sh  (delta为增量索引)

$/usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/sphinx.conf delta --rotate

$vi test1.sh

/usr/local/coreseek/bin/indexer -c etc/example.conf --merge example delta --rotate  #合并索引

/usr/local/mysql/bin/mysql -hlocalhost -uroot -proot -Dnca -e 'replace into sph_counter select 1,max(id) from documents;' #更新sph_counter中maxid值

/usr/local/coreseek/bin/indexer -c /etc/example.conf delta --rotate #清空增量索引

$crontab -e

*/1 * * * * /usr/local/coreseek/etc/delta.sh #每隔一分钟运行一次

30 2 * * * /usr/local/coreseek/etc/test1.sh #每天半夜2:30运行

保存并对delta.sh/test1.sh设权限 chmod 755;



$service crond stop

$service crond start



1.去搜狗拼音下载常用词库: 链接.


3. 将txt转为utf8编码,写脚本将文件转为mmseg词典txt,这里给出一个php脚本的示例:


/usr/local/mmseg3/bin/mmseg -u /usr/local/mmseg3/etc/dict.txt
mkdir backup
mv uni.lib backup/
mv dict.txt.uni uni.lib


1.生成JAR 包(也可以直接导入java文件)

在coreseek-3.2.13-win32文件夹内有一个API目录,进入api\java 双击mk.cmd 生成jar包,双击mkdoc.cmd生成DOC文档,在那里还有一个test.java文件,现在我们可以基于这个基础上实现我们的JAVA api调用



*$Id: test.java 2055 2009-11-06 23:09:58Z shodan $


package com.xxxxxx.action;

import java.util.Date;

import org.sphx.api.SphinxClient;

import org.sphx.api.SphinxException;

import org.sphx.api.SphinxMatch;

import org.sphx.api.SphinxResult;

import org.sphx.api.SphinxWordInfo;


*Test class for sphinx API


public class test


public static void main ( String[] argv ) throws SphinxException


//               if( argv==null || argv.length<1 )

//               {

//                         System.out.print( "Usage: java -jar sphinxapi.jar [OPTIONS] query words\n\n" );

//                         System.out.print( "Options are:\n" );

//                         System.out.print( "-h, --host <HOST>\tconnect to searchd at host HOST\n" );

//                         System.out.print( "-p, --port\t\tconnect to searchd at port PORT\n" );

//                         System.out.print( "-i, --index <IDX>\tsearch through index(es) specified byIDX\n" );

//                         System.out.print( "-s, --sortby <CLAUSE>\tsort matches by 'CLAUSE' in sort_extendedmode\n" );

//                         System.out.print( "-S, --sortexpr <EXPR>\tsort matches by 'EXPR' DESC in sort_exprmode\n" );

//                         System.out.print( "-a, --any\t\tuse 'match any word' matching mode\n" );

//                         System.out.print( "-b, --boolean\t\tuse 'boolean query' matching mode\n" );

//                         System.out.print( "-e, --extended\t\tuse 'extended query' matching mode\n" );

//                         System.out.print( "-ph,--phrase\t\tuse 'exact phrase' matching mode\n" );

System.out.print( "-f, --filter <ATTR>\tfilter by attribute 'ATTR' (default is 'group_id')\n" );

System.out.print( "-v, --value <VAL>\tadd VAL to allowed 'group_id' values list\n" );

//                         System.out.print( "-g, --groupby <EXPR>\tgroup matches by 'EXPR'\n" );

//                         System.out.print( "-gs,--groupsort <EXPR>\tsort groups by 'EXPR'\n" );

System.out.print( "-d, --distinct <ATTR>\tcount distinct values of 'ATTR''\n");

//                         System.out.print( "-l, --limit <COUNT>\tretrieve COUNT matches (default: 20)\n");

//                         System.out.print( "-ga, --geoanchor <LATATTR> <LONGATTR> <LAT><LONG>\n" );

//                         System.out.print( "\t\t\tset anchor for geodistance\n" );

//                         System.out.print( "--select <EXPRS>\tselect the listed expressions only\n" );


//                         System.exit( 0 );

//               }

StringBuffer q = new StringBuffer();

String host = "";

int port = 3312;

int mode = SphinxClient.SPH_MATCH_EXTENDED;

//               String index = "content";

String index = "*";

int offset = 0;

int limit = 50;

int sortMode = SphinxClient.SPH_SORT_ATTR_DESC;

String sortClause = "posttime";

String groupBy = "";

String groupSort = "";

SphinxClient cl = new SphinxClient();

/*parse arguments */

//               if( argv!=null)

//                         for( int i=0; i<argv.length; i++ )

//               {

//                         Stringarg = argv[i];

//                         if( "-h".equals(arg) || "--host".equals(arg) )                              host = argv[++i];

//                         elseif ( "-p".equals(arg) || "--port".equals(arg) )            port = Integer.parseInt ( argv[++i]);

//                         elseif ( "-i".equals(arg) || "--index".equals(arg) )            index = argv[++i];

//                         elseif ( "-s".equals(arg) || "--sortby".equals(arg) )                  { sortMode =SphinxClient.SPH_SORT_EXTENDED; sortClause = argv[++i]; }

//                         elseif ( "-S".equals(arg) || "--sortexpr".equals(arg) )     { sortMode = SphinxClient.SPH_SORT_EXPR;sortClause = argv[++i]; }

//                         elseif ( "-a".equals(arg) || "--any".equals(arg) )                       mode =SphinxClient.SPH_MATCH_ANY;

//                        else if ( "-b".equals(arg) ||"--boolean".equals(arg) )               mode= SphinxClient.SPH_MATCH_BOOLEAN;

//                         elseif ( "-e".equals(arg) || "--extended".equals(arg) )   mode = SphinxClient.SPH_MATCH_EXTENDED;

//                         elseif ( "-ph".equals(arg)|| "--phrase".equals(arg) )               mode =SphinxClient.SPH_MATCH_PHRASE;

//                         elseif ( "-e2".equals(arg) )                                                                        mode= SphinxClient.SPH_MATCH_EXTENDED2;

//                         elseif ( "-g".equals(arg) || "--group".equals(arg) )                   groupBy = argv[++i];

//                         elseif ( "-gs".equals(arg)|| "--groupsort".equals(arg) ) groupSort = argv[++i];

//                         elseif ( "-o".equals(arg) || "--offset".equals(arg) )                   offset =Integer.parseInt(argv[++i]);

//                         elseif ( "-l".equals(arg) || "--limit".equals(arg) )             limit =Integer.parseInt(argv[++i]);

//                         elseif ( "-ga".equals(arg)|| "--geoanchor".equals(arg) )        cl.SetGeoAnchor ( argv[++i], argv[++i],Float.parseFloat(argv[++i]), Float.parseFloat(argv[++i]) );

//                         elseif ( "--select".equals(arg) )                                                                cl.SetSelect( argv[++i] );

//                         elseq.append ( argv[i] ).append ( " " );

//               }


//               q.append("(浦发银行)|(中信证券)|(恒生电子)");


//设置sphinx 服务端,和端口

cl.SetServer( host, port );

cl.SetWeights( new int[] { 100, 1 } );


cl.SetMatchMode( mode );


cl.SetLimits( offset, limit );


cl.SetSortMode( sortMode, sortClause );

//               cl.SetFilterRange(arg0,arg1, arg2, arg3)

if( groupBy.length()>0 )

cl.SetGroupBy( groupBy, SphinxClient.SPH_GROUPBY_ATTR, groupSort );

SphinxResult res = cl.Query(q.toString(), index);

if( res==null )


System.err.println( "Error: " + cl.GetLastError() );

System.exit( 1 );


if( cl.GetLastWarning()!=null && cl.GetLastWarning().length()>0 )

System.out.println( "WARNING: " + cl.GetLastWarning() + "\n" );

/*print me out */

System.out.println( "Query '" + q + "' retrieved " + res.total + " of " + res.totalFound + " matches in " + res.time + " sec." );

System.out.println( "Query stats:" );

for( int i=0; i<res.words.length; i++ )


SphinxWordInfo wordInfo = res.words[i];

System.out.println( "\t'" + wordInfo.word + "' found " + wordInfo.hits +" times in " + wordInfo.docs + " documents" );


System.out.println( "\nMatches:" );

for( int i=0; i<res.matches.length; i++ )


SphinxMatch info = res.matches[i];

System.out.print( (i+1) + ". id=" + info.docId + ", weight=" + info.weight);

if( res.attrNames==null || res.attrTypes==null )


for( int a=0; a<res.attrNames.length; a++ )


System.out.print( ", " + res.attrNames[a] + "=" );

if( ( res.attrTypes[a] & SphinxClient.SPH_ATTR_MULTI )!=0 )


System.out.print( "(" );

long[] attrM = (long[]) info.attrValues.get(a);

if( attrM!=null )

for ( int j=0; j<attrM.length; j++ )


if( j!=0 )

System.out.print( "," );

System.out.print( attrM[j] );


System.out.print( ")" );



switch( res.attrTypes[a] )






/*longs or floats; print as is */

System.out.print( info.attrValues.get(a) );



Long iStamp = (Long) info.attrValues.get(a);

Date date = new Date ( iStamp.longValue()*1000 );

System.out.print( date.toString() );



System.out.print( "(unknown-attr-type=" + res.attrTypes[a] + ")" );









*$Id: test.java 2055 2009-11-06 23:09:58Z shodan $



这个地方得要注意下,你想查浦发银行,中信证券,恒生电子,如果直接写SPHINX会把它当成一个词语,这样就得到很少的结果,如果("(浦发银行)| (中信证券)|(恒生电子)")这样写,这会得到包括“浦发银行” 或“恒生电子” 或“中信证券”的关键词的结果(这个问题我找了快两天,一直以为是自己什么配置错,http://www.coreseek.cn/forum/2_1010_0.html#msg4267)

cl.SetMatchMode ( mode );
cl.SetSortMode ( sortMode, sortClause );


