perf: file encoding; perf: leave team code; perf: full text search code (#3528)

* perf: text encoding

* perf: leave team code

* perf: full text search code

* fix: http status

* perf: embedding search and vector avatar
c121914yu committed Jan 5, 2025
1 parent cf19058 commit 65ee518
Showing 24 changed files with 344 additions and 99 deletions.
4 changes: 3 additions & 1 deletion docSite/content/zh-cn/docs/development/upgrading/4818.md
@@ -13,4 +13,6 @@ weight: 806
2. New - Support department-structure permission mode.
3. Improved - Security checks on image uploads, plus unique storage for avatar images to ensure they do not accumulate.
4. Improved - Split the Mongo full-text index into a separate collection.
5. Improved - Merged knowledge-base retrieval queries, reducing the number of database lookups.
6. Improved - File-encoding detection, reducing the chance of garbled CSV files.
7. Fixed - When uploading HTML files, base64 images could not be automatically converted to image links.
17 changes: 17 additions & 0 deletions packages/global/common/file/tools.ts
@@ -2,6 +2,7 @@ import { detect } from 'jschardet';
import { documentFileType, imageFileType } from './constants';
import { ChatFileTypeEnum } from '../../core/chat/constants';
import { UserChatItemValueItemType } from '../../core/chat/type';
import * as fs from 'fs';

export const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
@@ -16,6 +17,22 @@ export const formatFileSize = (bytes: number): string => {
export const detectFileEncoding = (buffer: Buffer) => {
return detect(buffer.slice(0, 200))?.encoding?.toLocaleLowerCase();
};
export const detectFileEncodingByPath = async (path: string) => {
// Get 64KB file head
const MAX_BYTES = 64 * 1024;
const buffer = Buffer.alloc(MAX_BYTES);

const fd = await fs.promises.open(path, 'r');
try {
// Read file head
const { bytesRead } = await fd.read(buffer, 0, MAX_BYTES, 0);
const actualBuffer = buffer.slice(0, bytesRead);

return detect(actualBuffer)?.encoding?.toLocaleLowerCase();
} finally {
await fd.close();
}
};

// Url => user upload file type
export const parseUrlToFileType = (url: string): UserChatItemValueItemType['file'] | undefined => {
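A minimal usage sketch of the new helper (the fallback logic below is an assumption, not part of the commit): detect the encoding from the 64KB head, then decode the full file accordingly.

// Hypothetical usage sketch: detect encoding from the file head, then decode
// the whole file. Node natively decodes only a few encodings (utf-8, latin1,
// utf-16le); anything else, e.g. gbk, would need a library such as iconv-lite.
const readTextFileSketch = async (path: string) => {
  const encoding = (await detectFileEncodingByPath(path)) ?? 'utf-8';
  const raw = await fs.promises.readFile(path);
  const nativeEncodings = ['utf-8', 'latin1', 'utf-16le'];
  return raw.toString(
    (nativeEncodings.includes(encoding) ? encoding : 'utf-8') as BufferEncoding
  );
};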
6 changes: 2 additions & 4 deletions packages/service/common/file/gridfs/controller.ts
@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
import { readRawContentByFileBuffer } from '../read/utils';
@@ -36,7 +36,6 @@ export async function uploadFile({
path,
filename,
contentType,
encoding,
metadata = {}
}: {
bucketName: `${BucketNameEnum}`;
@@ -45,7 +44,6 @@
path: string;
filename: string;
contentType?: string;
encoding: string;
metadata?: Record<string, any>;
}) {
if (!path) return Promise.reject(`filePath is empty`);
@@ -59,7 +57,7 @@
// Add default metadata
metadata.teamId = teamId;
metadata.uid = uid;
metadata.encoding = encoding;
metadata.encoding = await detectFileEncodingByPath(path);

// create a gridfs bucket
const bucket = getGridBucket(bucketName);
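After this change callers stop passing encoding; the value stored in GridFS metadata is detected from the file head at upload time. A hedged call sketch (ids, path, and bucket value are placeholders, and parameters hidden by the collapsed diff are omitted):

// Placeholder values throughout; only the shape of the call is the point.
await uploadFile({
  bucketName: 'dataset',
  teamId: 'team-id',
  uid: 'user-id',
  path: '/tmp/upload.csv', // hypothetical temp path
  filename: 'upload.csv',
  contentType: 'text/csv',
  metadata: {}
  // no `encoding` field any more: it is detected via detectFileEncodingByPath
});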
1 change: 0 additions & 1 deletion packages/service/common/middle/reqFrequencyLimit.ts
@@ -22,7 +22,6 @@ export function useReqFrequencyLimit(seconds: number, limit: number, force = fal
expiredTime: addSeconds(new Date(), seconds)
});
} catch (_) {
res.status(429);
jsonRes(res, {
code: 429,
error: ERROR_ENUM.tooManyRequest
3 changes: 1 addition & 2 deletions packages/service/common/response/index.ts
@@ -33,8 +33,7 @@ export const jsonRes = <T = any>(

addLog.error(`Api response error: ${url}`, ERROR_RESPONSE[errResponseKey]);

res.status(ERROR_RESPONSE[errResponseKey].code);
return res.json(ERROR_RESPONSE[errResponseKey]);
return res.status(code).json(ERROR_RESPONSE[errResponseKey]);
}

// another error
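These two response changes belong together: useReqFrequencyLimit no longer sets the status itself, and jsonRes now derives the HTTP status from the code it receives in a single chained call. A condensed sketch of the resulting contract (not the full jsonRes implementation; the default code of 200 and the body shape are assumptions):

// Condensed sketch only; assumptions noted above.
import type { NextApiResponse } from 'next';

const jsonResSketch = (
  res: NextApiResponse,
  { code = 200, error }: { code?: number; error?: unknown }
) => {
  // One chained call now sets both the HTTP status line and the JSON body,
  // which is why the rate limiter above dropped its explicit res.status(429).
  return res.status(code).json({ code, message: error ? String(error) : '' });
};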
4 changes: 2 additions & 2 deletions packages/service/core/dataset/data/dataTextSchema.ts
@@ -25,7 +25,7 @@ const DatasetDataTextSchema = new Schema({
required: true
},
dataId: {
type: String,
type: Schema.Types.ObjectId,
ref: DatasetDataCollectionName,
required: true
},
@@ -37,7 +37,7 @@ const DatasetDataTextSchema = new Schema({

try {
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
DatasetDataTextSchema.index({ dataId: 'hashed' });
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
} catch (error) {
console.log(error);
}
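Changing dataId from a hashed index to a unique one enforces at most one full-text row per data row, which also makes writes idempotent. A hedged sketch of the write pattern this enables (the upsert itself is an assumption; model and field names come from this file, and the variables are placeholders):

// Hedged sketch: with the unique index on dataId, the full-text row can be
// upserted without risking duplicates (possible under the old hashed index).
await MongoDatasetDataText.updateOne(
  { dataId },
  { $set: { teamId, datasetId, collectionId, fullTextToken } },
  { upsert: true }
);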
17 changes: 10 additions & 7 deletions packages/service/core/dataset/data/schema.ts
@@ -39,10 +39,6 @@ const DatasetDataSchema = new Schema({
type: String,
default: ''
},
fullTextToken: {
type: String,
default: ''
},
indexes: {
type: [
{
@@ -72,7 +68,13 @@ const DatasetDataSchema = new Schema({
default: 0
},
rebuilding: Boolean,
inited: Boolean

// Abandon
fullTextToken: {
type: String,
default: ''
},
initFullText: Boolean
});

try {
@@ -85,13 +87,14 @@ try {
updateTime: -1
});
// full text index
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
DatasetDataSchema.index({ updateTime: 1 });
// rebuild data
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
DatasetDataSchema.index({ inited: 'hashed' });

DatasetDataSchema.index({ initFullText: 1 });
} catch (error) {
console.log(error);
}
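The fullTextToken field and its commented-out text index are kept only for migration (marked "Abandon"), and the new initFullText index lets a background job find rows whose tokens have not yet been moved to the separate collection. A hedged sketch of such a scan (the batch logic and the flag's direction are assumptions):

// Hedged migration sketch: page through rows not yet copied to
// MongoDatasetDataText, using the new initFullText index.
const pending = await MongoDatasetData.find(
  { initFullText: { $exists: false } },
  '_id teamId datasetId collectionId fullTextToken'
)
  .limit(1000)
  .lean();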
174 changes: 142 additions & 32 deletions packages/service/core/dataset/search/controller.ts
@@ -289,20 +289,22 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
).lean()
]);

const formatResult = dataList
.map((data, index) => {
const collection = collections.find((col) => String(col._id) === String(data.collectionId));
const formatResult = results
.map((item, index) => {
const collection = collections.find((col) => String(col._id) === String(item.collectionId));
if (!collection) {
console.log('Collection is not found', data);
console.log('Collection is not found', item);
return;
}
const data = dataList.find((data) =>
data.indexes.some((index) => index.dataId === item.id)
);
if (!data) {
console.log('Data is not found', item);
return;
}

// add score to data (results are already sorted; the first one has the highest score)
const dataIdList = data.indexes.map((item) => item.dataId);
const maxScoreResult = results.find((item) => {
return dataIdList.includes(item.id);
});
const score = maxScoreResult?.score || 0;
const score = item?.score || 0;

const result: SearchDataResponseItemType = {
id: String(data._id),
Expand All @@ -320,8 +322,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
})
.filter(Boolean) as SearchDataResponseItemType[];

formatResult.sort((a, b) => b.score[0].value - a.score[0].value);

return {
embeddingRecallResults: formatResult,
tokens
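Dropping the explicit sort is safe because, as the comment above notes, results arrive from the vector store already ordered by score, and formatResult now maps over results one-to-one, preserving that order. A one-line statement of the invariant (illustrative only, not production code):

// Illustrative invariant: results is score-descending, so the mapped
// formatResult needs no re-sort.
const isSorted = results.every((r, i) => i === 0 || results[i - 1].score >= r.score);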
@@ -411,22 +411,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
'_id name fileId rawLink externalFileId externalFileUrl',
{ ...readFromSecondary }
).lean();
// const [dataList, collections] = await Promise.all([
// MongoDatasetData.find(
// {
// _id: { $in: searchResults.map((item) => item.dataId) }
// },
// '_id datasetId collectionId updateTime q a chunkIndex indexes',
// { ...readFromSecondary }
// ).lean(),
// MongoDatasetCollection.find(
// {
// _id: { $in: searchResults.map((item) => item.collectionId) }
// },
// '_id name fileId rawLink externalFileId externalFileUrl',
// { ...readFromSecondary }
// ).lean()
// ]);

return {
fullTextRecallResults: searchResults
Expand All @@ -439,9 +423,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
return;
}

// const score =
// searchResults.find((item) => String(item.dataId) === String(data._id))?.score || 0;

return {
id: String(data._id),
datasetId: String(data.datasetId),
@@ -459,6 +440,135 @@
tokenLen: 0
};
};
const fullTextRecall2 = async ({
query,
limit,
filterCollectionIdList,
forbidCollectionIdList
}: {
query: string;
limit: number;
filterCollectionIdList?: string[];
forbidCollectionIdList: string[];
}): Promise<{
fullTextRecallResults: SearchDataResponseItemType[];
tokenLen: number;
}> => {
if (limit === 0) {
return {
fullTextRecallResults: [],
tokenLen: 0
};
}

const searchResults = (
await Promise.all(
datasetIds.map(async (id) => {
return MongoDatasetDataText.aggregate(
[
{
$match: {
teamId: new Types.ObjectId(teamId),
datasetId: new Types.ObjectId(id),
$text: { $search: jiebaSplit({ text: query }) },
...(filterCollectionIdList
? {
collectionId: {
$in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
}
}
: {}),
...(forbidCollectionIdList && forbidCollectionIdList.length > 0
? {
collectionId: {
$nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
}
}
: {})
}
},
{
$sort: {
score: { $meta: 'textScore' }
}
},
{
$limit: limit
},
{
$project: {
_id: 1,
collectionId: 1,
dataId: 1,
score: { $meta: 'textScore' }
}
}
],
{
...readFromSecondary
}
);
})
)
).flat() as (DatasetDataTextSchemaType & { score: number })[];

// Get data and collections
const [dataList, collections] = await Promise.all([
MongoDatasetData.find(
{
_id: { $in: searchResults.map((item) => item.dataId) }
},
'_id datasetId collectionId updateTime q a chunkIndex indexes',
{ ...readFromSecondary }
).lean(),
MongoDatasetCollection.find(
{
_id: { $in: searchResults.map((item) => item.collectionId) }
},
'_id name fileId rawLink externalFileId externalFileUrl',
{ ...readFromSecondary }
).lean()
]);

return {
fullTextRecallResults: searchResults
.map((item, index) => {
const collection = collections.find(
(col) => String(col._id) === String(item.collectionId)
);
if (!collection) {
console.log('Collection is not found', item);
return;
}
const data = dataList.find((data) => String(data._id) === String(item.dataId));
if (!data) {
console.log('Data is not found', item);
return;
}

return {
id: String(data._id),
datasetId: String(data.datasetId),
collectionId: String(data.collectionId),
updateTime: data.updateTime,
q: data.q,
a: data.a,
chunkIndex: data.chunkIndex,
indexes: data.indexes,
...getCollectionSourceData(collection),
score: [
{
type: SearchScoreTypeEnum.fullText,
value: item.score || 0,
index
}
]
};
})
.filter(Boolean) as SearchDataResponseItemType[],
tokenLen: 0
};
};
const reRankSearchResult = async ({
data,
query
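fullTextRecall2 above queries the dedicated MongoDatasetDataText collection, one aggregation per dataset, sorting on the $meta textScore. Mongo's $text operator matches whole tokens only, which is why both the stored fullTextToken and the incoming query pass through jiebaSplit first. A small illustration (the tokenized output is approximate):

// Illustration only: jieba pre-tokenizes Chinese text so Mongo's
// whole-token $text matching can hit individual words.
const stored = jiebaSplit({ text: '知识库检索优化' }); // ≈ '知识库 检索 优化'
const queryTokens = jiebaSplit({ text: '检索' }); // ≈ '检索'
// { $text: { $search: queryTokens } } then matches the stored row.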
@@ -526,7 +636,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
forbidCollectionIdList,
filterCollectionIdList
}),
fullTextRecall({
fullTextRecall2({
query,
limit: fullTextLimit,
filterCollectionIdList,