a large number of all-zero tensors appeared in the encoded tensors #3195

Open · rangehow opened this issue Jan 25, 2025 · 0 comments

rangehow commented Jan 25, 2025

While testing the official sample code, I ran into a problem: when I used the code below to encode my dataset, a large number of all-zero tensors appeared in the final saved tensors, roughly 5 million out of 20 million rows.

import logging
import os
import numpy as np
import datasets
from torch.utils.data import DataLoader
from tqdm import tqdm
from sentence_transformers import LoggingHandler, SentenceTransformer
import argparse

logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)

def parse_dataset(dataset_str):
    if dataset_str.endswith('.tsv'):
        # Try to read the TSV file
        dataset = datasets.load_dataset("csv", data_files=dataset_str, delimiter="\t", num_proc=32)['train']
    else:
        dataset = datasets.load_from_disk(dataset_str)
    return dataset

def parse_args():
    parser = argparse.ArgumentParser(description="Process datasets with padding and multiprocessing.")
    parser.add_argument("--model", default="stella", help="Model name or path.")
    parser.add_argument("--dataset", required=True, help="Path to the dataset.")
    parser.add_argument("--batch_size", type=int, default=1024, help="Batch size per worker.")
    parser.add_argument("--output_dir", default="./embeddings", help="Directory to save embeddings.")
    return parser.parse_args()

if __name__ == "__main__":
    # Parse command-line arguments
    args = parse_args()

    # Load the dataset
    dataset = parse_dataset(args.dataset)

    # Load the model
    model = SentenceTransformer(args.model, trust_remote_code=True)

    # Start the multi-process pool
    pool = model.start_multi_process_pool()

    # Set up the DataLoader
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=8192,
        num_workers=8,  # adjust to the number of CPU cores
        pin_memory=True,  # enable pin_memory for faster host-to-GPU transfer
        prefetch_factor=4,  # prefetch more batches per worker
    )

    # Ensure the output directory exists
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Accumulate embeddings from all batches
    all_embeddings = []

    for i, batch in enumerate(tqdm(dataloader)):
        sentences = batch["text"]

        batch_emb = model.encode_multi_process(sentences, pool, batch_size=2048)
        print(f"Embeddings computed for batch {i+1}. Shape: {batch_emb.shape}")

        all_embeddings.append(batch_emb)

    all_embeddings = np.vstack(all_embeddings)
    print(f"Total embeddings shape: {all_embeddings.shape}")

    output_file = os.path.join(args.output_dir, "all_embeddings.npy")
    np.save(output_file, all_embeddings)
    print(f"All embeddings saved at {output_file}")


    model.stop_multi_process_pool(pool)

    print("All embeddings have been processed and saved.")

I detected the issue by loading the saved tensors back and comparing a row against a fresh single-process encode:

vector = np.load('/mnt/dolphinfs/hdd_pool/docker/user/hadoop-aipnlp/INS/ruanjunhao04/ruanjunhao/chatrag-bench/embeddings/final_embeddings_topiocqa.npy', mmap_mode='r')

vector_need_to_be_checked = model.encode(dataset[10000011]['text'])
are_close = np.allclose(vector[10000011], vector_need_to_be_checked, atol=1e-6)

are_close is False at specific rows (e.g., row 1025 or 10000011).
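
The same comparison can be looped over several suspect indices (a sketch reusing vector, dataset, and model from the snippets above):

for idx in (1025, 10000011):
    ref = model.encode(dataset[idx]['text'])
    print(idx, np.allclose(vector[idx], ref, atol=1e-6))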

The following code finds roughly 5 million all-zero rows:

zero_mask = np.all(all_embeddings == 0, axis=1)
zero_indices = np.where(zero_mask)[0]  
print(f"Found {len(zero_indices)} zero embeddings at indices: {zero_indices}")

What could be the possible reasons for this issue? Thank you for your assistance.
