Fix buffer boundary issues and cursor handling in IK tokenizer (#1080)

* Adjusted the calculation of offset in fillBuffer() to correctly handle the case where the cursor reaches the end of the buffer, preventing duplicate processing of the last character.

Co-authored-by: Hj Wu <[email protected]>
infinilabs · Dec 12, 2024 · 51d3979 · 51d3979
1 parent 5a04b42
commit 51d3979
Showing 1 changed file with 9 additions and 4 deletions.
diff --git a/core/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java b/core/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
@@ -121,15 +121,20 @@ int fillBuffer(Reader reader) throws IOException{
     		readCount = reader.read(segmentBuff);
 			this.lastUselessCharNum = 0;
     	}else{
-    		int offset = this.available - this.cursor;
+			// moveCursor()只能将cursor移动到available-1，需要+1
+    		int offset = this.available - (this.cursor + 1);
     		if(offset > 0){
     			//最近一次读取的>最近一次处理的，将未处理的字串拷贝到segmentBuff头部
-    			System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset);
+				//当前cursor是已经处理过的字符，需要+1
+    			System.arraycopy(this.segmentBuff , this.cursor + 1, this.segmentBuff , 0 , offset);
     			readCount = offset;
     		}
     		//继续读取reader ，以onceReadIn - onceAnalyzed为起始位置，继续填充segmentBuff剩余的部分
-    		readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset);
-    	}            	
+    		int numRead = reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
+			if (numRead != -1) {
+				readCount += numRead;
+			}
+    	}
     	//记录最后一次从Reader中读入的可用字符长度
     	this.available = readCount;
     	//重置当前指针