最近研究一個翻譯系統,對老師上傳的一段文本自動拆分成句,乍一聽好像很簡單哦,split分隔下句號不就完事了嘛!。。。mdzz還是太年輕,一不小心上當了,還有嘆號、問號、雙引號呢~!當然這個也不算什麼,找個正則表達式就好啦^_^!太天真了!!!勞資突然發現英文簡直了,竟然還有縮略詞!!!這尼瑪怎麼分析哦,一頓翻山越嶺,發現國內的相關文章有限,對於縮略詞都不能有很好的支持,於是在這個時間段,國內嚴禁翻牆的時間。。。我偷偷翻牆去問問歪果仁了,警察叔叔不要抓我,我只是愛學習的騷年Σ( ° △ °|||)︴    然而實際情況是,歪果仁自己也煩躁他們自己的語言太事逼。。。為什麼就不能像中文一樣有明顯的句子邊界呢。。。好吧,我特么也是醉了,正當我一籌莫展之際,一個白胡子老頭從天而降,說,騷年,需要幫助嗎。別誤會,不是援助交際ヽ(=^・ω・^=)丿。。。好吧言歸正傳,我看到了NLP,並找到了lingpipe,引用起來相當簡單,一個下午從接觸到實現徹底搞定,說了一堆廢話,開始正文!

import java.util.ArrayList;

import java.util.List;

import com.aliasi.sentences.IndoEuropeanSentenceModel;

import com.aliasi.sentences.SentenceModel;

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;

import com.aliasi.tokenizer.Tokenizer;

import com.aliasi.tokenizer.TokenizerFactory;

public class SpliteTextInSentence {

static final TokenizerFactory TOKENIZER_FACTORY = IndoEuropeanTokenizerFactory.INSTANCE;

static final SentenceModel SENTENCE_MODEL = new IndoEuropeanSentenceModel();


public static void main(String[] args) {

SpliteTextInSentence s = new SpliteTextInSentence();

String str1 = "Water-splashing Festival is one of the most important festivals in the world, which is popular among Dai people of China and the southeast Asia. It has been celebrated by people for more than 700 years and now this festival is an necessary way for people to promote the cooperation and communication among countries.";

String str2 = "This is how I tried to split a paragraph into a sentence. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.S and numbers like 2.2. They all got split by the above code.";

String str3 = "My friend holds a Msc. in Computer Science.";

String str4 = "This is a test? This is a T.L.A. test!";

String text = "50 Cent XYZ120 DVD Player 50 Cent lawyer. Person is john, he is a lawyer.";

String str5 = "\"I do not ask for your forgiveness,\" he said, in a tone that became more firm and forceful. \"I have no illusions, and I am convinced that death is waiting for me: it is just.\"";

String str6 = "\"The Times have had too much influence on me.\" He laughed bitterly and said to himself, \"it is only two steps away from death. Alone with me, I am still hypocritical... Ah, the 19th century!\"";

String str7 = "潑水節是世界上最重要節日之一,深受中國傣族和東南亞人民的喜愛。七百多年來,人們一直在慶祝這個節日,現在這個節日是促進國家間合作和交流的必要方式。";


List sl = testChunkSentences(s.splitfuhao(str7));




for (String row : sl) {






private static List testChunkSentences(String text) {

List result = new ArrayList();

List tokenList = new ArrayList();

List whiteList = new ArrayList();

Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(text.toCharArray(),

0, text.length());

tokenizer.tokenize(tokenList, whiteList);

String[] tokens = new String[tokenList.size()];

String[] whites = new String[whiteList.size()];



int[] sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens,


int sentStartTok = 0;

int sentEndTok = 0;

for (int i = 0; i < sentenceBoundaries.length; ++i) {

System.out.println("Sentense " + (i + 1) + ", sentense's length(from 0):" + (sentenceBoundaries[i]));

StringBuilder sb = new StringBuilder();

sentEndTok = sentenceBoundaries[i];

for (int j = sentStartTok; j <= sentEndTok; j++) {

sb.append(tokens[j]).append(whites[j + 1]);


sentStartTok = sentEndTok + 1;



//System.out.println("Final result:" + result);

return result;



public String splitfuhao(String str){

String[] ChineseInterpunction = { "“", "”", "‘", "’", "。", ",", ";", ":", "?", "!", "……", "—", "~", "(", ")", "《", "》" };

String[] EnglishInterpunction = { "\"", "\"", "'", "'", ".", ",", ";", ":", "?", "!", "…", "-", "~", "(", ")", "" };

for (int j = 0; j < ChineseInterpunction.length; j++)


//alert("txt.replace("+ChineseInterpunction[j]+", "+EnglishInterpunction[j]+")");

//String reg=str.matches(ChineseInterpunction[j],"g");

str = str.replace(ChineseInterpunction[j], EnglishInterpunction[j]+" ");


return str;






