
Commit

update
goliaro committed Dec 10, 2024
1 parent 066d9e8 commit 5cc5535
Showing 25 changed files with 964 additions and 654,868 deletions.

12 files were deleted.

6 changes: 1 addition & 5 deletions benchmarking/get_sharegpt_trace.py
@@ -1,9 +1,5 @@
from dataclasses import asdict, dataclass, field
import json
import os
import random
import requests
import argparse
import json, os, random, requests, argparse
from tqdm.asyncio import tqdm
from typing import List, Optional
from collections import OrderedDict
125 changes: 116 additions & 9 deletions benchmarking/get_wildchat_trace.py
@@ -1,18 +1,22 @@
import datasets
from transformers import AutoTokenizer
from tqdm import tqdm
import json, os, argparse
import json, os, random, requests, argparse
from dataclasses import asdict, dataclass, field
from typing import List, Optional


from collections import OrderedDict
import pandas as pd
from math import ceil
from random import uniform
import numpy as np

@dataclass
class TraceEntry:
prompt: str
response: str
prompt_length: int
response_length: int
arrival_time: int

@dataclass
class TraceMetadata:
@@ -25,15 +29,99 @@ class TraceMetadata:
min_response_length: int
avg_response_length: float
max_total_length: int
trace_type: str
arrival_rate: float

@dataclass
class Trace:
entries: List[TraceEntry] = field(default_factory=list)
metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0, 0,0))
metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0, 0, 0, "offline", 0.0))

def generate_arrival_rates_splitwise(n, target_arrival_rate_sec, seed):
def get_splitwise_trace(trace_type="conv"):
# Import Microsoft LLM 1 hour trace
df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"])
req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds
req_times = req_times - req_times.min()
req_times = req_times.tolist()
return req_times

debug_verbose = True
req_times = get_splitwise_trace()

np.random.seed(seed)
random.seed(seed)

microsec = 1000000
avg_arrival_rate = len(req_times) / (req_times[-1]/float(microsec)) # Request per second. Computed that way to enforce working with numbers of reasonable orders of magnitude
if debug_verbose:
print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate)
scale_factor = float(target_arrival_rate_sec) / avg_arrival_rate
if debug_verbose:
print("Scale factor to obtain target arrival rate: ", scale_factor)

# Buckets are 1 second timeframes
nb_buckets = ceil(req_times[-1] / microsec)
j = 0
# print("Number of buckets: ", nb_buckets)
bucket_sizes=[]
for i in range(nb_buckets):
bucket_size = 0
while(j < len(req_times) and req_times[j] >= i*microsec and req_times[j] < (i+1)*microsec):
bucket_size += 1
j += 1
bucket_size = bucket_size*scale_factor
prob = bucket_size - int(bucket_size)
bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob)
bucket_sizes.append(bucket_size)

arrival_times = []
for arrival_time, num_requests in enumerate(bucket_sizes):
for i in range(num_requests):
arrival_times.append(arrival_time)
if len(arrival_times) > n:
arrival_times = arrival_times[:n]
elif len(arrival_times) < n:
print(f"Warning: not enough arrival_times ({len(arrival_times)}) in scaled trace to generate arrival times for all requests ({n})")
arrival_times += arrival_times[:n-len(arrival_times)]
assert(len(arrival_times) == n)
return arrival_times

def generate_poisson_arrivals(n, target_arrival_rate_sec, seed):
"""
Generate arrival times for n requests following a Poisson process.
Parameters:
n (int): Number of requests to generate
arrival_rate (float): Average arrival rate (requests per second)
Returns:
numpy.ndarray: Array of arrival times in seconds
"""
np.random.seed(seed)
random.seed(seed)

def build_trace(
dataset: datasets.Dataset, model_name: str, num_entries: int, max_length: int, seed: int, apply_chat_template: bool = False
):
# Generate n exponentially distributed inter-arrival times
# For a Poisson process, inter-arrival times follow exponential distribution
inter_arrival_times = np.random.exponential(scale=1/target_arrival_rate_sec, size=n)

# Calculate cumulative sum to get arrival times
arrival_times = np.cumsum(inter_arrival_times)

# Round to 6 decimal places for practical purposes (microsecond precision)
arrival_times = np.round(arrival_times, decimals=6)

return arrival_times


def build_trace(dataset: datasets.Dataset,
model_name: str,
num_entries: int,
max_length: int,
seed: int,
trace_type: str = "offline",
arrival_rate: float = 0.0,
apply_chat_template: bool = False):
tokenizer = AutoTokenizer.from_pretrained(model_name)

dataset = (
@@ -69,8 +157,17 @@ def build_trace(
min_response_length=float("inf"),
avg_response_length=0,
max_total_length=0,
trace_type=trace_type,
arrival_rate=arrival_rate
)

arrival_times = num_entries*[0.0]
if trace_type == "poisson":
arrival_times = generate_poisson_arrivals(num_entries, arrival_rate, seed)
elif trace_type == "splitwise":
arrival_times = generate_arrival_rates_splitwise(num_entries, arrival_rate, seed)
assert(len(arrival_times) == num_entries)

for prompt, response in tqdm(pairs, desc="Processing HF trace"):
if apply_chat_template:
prompt = tokenizer.apply_chat_template(
@@ -82,7 +179,7 @@
response_length = len(tokenizer(response)["input_ids"])
if prompt_length + response_length > max_length:
continue
new_entry = TraceEntry(prompt, response, prompt_length, response_length)
new_entry = TraceEntry(prompt, response, prompt_length, response_length, arrival_times[len(trace.entries)])
trace.entries.append(new_entry)
trace_metadata.max_prompt_length = max(trace_metadata.max_prompt_length, prompt_length)
trace_metadata.min_prompt_length = min(trace_metadata.min_prompt_length, prompt_length)
@@ -96,6 +193,7 @@ def build_trace(
trace_metadata.avg_prompt_length /= len(trace.entries)
trace_metadata.avg_response_length /= len(trace.entries)
trace_metadata.avg_entries_per_partition = len(trace.entries)
trace_metadata.arrival_rate = arrival_rate

trace.metadata = trace_metadata

@@ -125,13 +223,22 @@ def save_trace(trace: Trace, output_path: str):
parser.add_argument("-n", "--num_entries", type=int, default=250, help="Number of entries")
parser.add_argument("-s", "--seed", type=int, default=12345, help="Random seed")
parser.add_argument("-o", "--output_file", type=str, default="./traces/wildchat.json", help="Output file name")
parser.add_argument("-t", "--trace-type", type=str, choices=["offline", "poisson", "splitwise"], default="offline", help="Arrival Times Trace Type")
parser.add_argument("-a", "--arrival-rate", type=float, default=0.0, help="Arrival Rate")
args = parser.parse_args()

# Change directory to that holding this script
os.chdir(os.path.dirname(os.path.abspath(__file__)))

dataset = datasets.load_dataset("allenai/WildChat")
trace = build_trace(dataset, args.model_name, args.num_entries, args.max_length, args.seed, apply_chat_template=False)
trace = build_trace(dataset,
args.model_name,
args.num_entries,
args.max_length,
args.seed,
trace_type=args.trace_type,
arrival_rate=args.arrival_rate,
apply_chat_template=False)
print("Build trace with the following metadata:")
print(trace.metadata)

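For context on the get_wildchat_trace.py changes above: the new --trace-type and --arrival-rate options select how a per-request arrival time is generated and attached to each TraceEntry. The snippet below is a minimal standalone sketch of the two non-default strategies added in this commit, Poisson inter-arrival sampling and the probabilistic rounding used when rescaling the Splitwise per-second buckets; the names poisson_arrivals, rescale_bucket, and the demo values are illustrative and not part of the committed script.

import numpy as np

def poisson_arrivals(n, rate_per_sec, seed=12345):
    # Inter-arrival times of a Poisson process are exponentially distributed with
    # mean 1 / rate; their cumulative sum gives absolute arrival times in seconds.
    np.random.seed(seed)
    inter_arrival = np.random.exponential(scale=1.0 / rate_per_sec, size=n)
    return np.round(np.cumsum(inter_arrival), decimals=6)

def rescale_bucket(count, scale_factor):
    # Probabilistic rounding used when scaling Splitwise per-second request buckets:
    # the fractional part becomes the probability of rounding up, so the expected
    # request count equals count * scale_factor.
    scaled = count * scale_factor
    prob = scaled - int(scaled)
    return int(scaled) + int(np.random.uniform(0, 1) <= prob)

if __name__ == "__main__":
    print(poisson_arrivals(5, rate_per_sec=2.0))  # five increasing timestamps, ~0.5 s apart on average
    print(rescale_bucket(3, 1.5))                 # 4 or 5, averaging 4.5 over many draws

With these in place, running the script with, for example, -t poisson -a 2.0 stamps each entry with a Poisson-process arrival time at roughly 2 requests/s, -t splitwise rescales the Azure LLM inference trace to the target rate, and the default -t offline leaves every arrival time at 0.0.
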
17 changes: 10 additions & 7 deletions benchmarking/overhead_test.sh
@@ -18,13 +18,6 @@ NCPUS=16
FSIZE=76000
ZSIZE=200000

# MODEL_NAME="meta-llama/Meta-Llama-3-8B"
# PEFT_MODEL_NAME="goliaro/llama-3-8b-lora-dolly"
# NGPUS=8
# NCPUS=16
# FSIZE=30000
# ZSIZE=30000

# MODEL_NAME="JackFram/llama-160m"
# PEFT_MODEL_NAME="goliaro/llama-160m-lora"
# NGPUS=4
@@ -42,6 +35,13 @@ max_tokens_per_batch_values=(
512
)

max_finetuning_fwd_tokens_values=(
"0,8,24,56,120"
"0,8,24,56,120,248"
"0,8,24,56,120,248,504"
)
max_finetuning_bwd_layers="1,2,4,8,16,32,64"

mkdir -p $OUTPUT_FOLDER


@@ -57,6 +57,7 @@

for i in "${!max_tokens_per_batch_values[@]}"; do
MAX_TOKENS_PER_BATCH=${max_tokens_per_batch_values[$i]}
MAX_FINETUNING_FWD_TOKENS=${max_finetuning_fwd_tokens_values[$i]}
LOG_FILE="${OUTPUT_FOLDER}/test_${MAX_TOKENS_PER_BATCH}_tokens_per_batch.log"
rm $LOG_FILE || true
./inference/peft/overhead_test \
@@ -69,5 +70,7 @@ for i in "${!max_tokens_per_batch_values[@]}"; do
--max-requests-per-batch $BATCH_SIZE \
--max-tokens-per-batch $MAX_TOKENS_PER_BATCH \
--max-sequence-length $MAX_SEQ_LEN \
--max-fwd-finetuning-tokens $MAX_FINETUNING_FWD_TOKENS \
--num-layers-per-finetuning-step $max_finetuning_bwd_layers \
2>&1 | tee $LOG_FILE
done
4 changes: 2 additions & 2 deletions benchmarking/plot_finetuning_overheads.py
@@ -112,7 +112,7 @@ def plot_bwd_overhead(filepath, model_name, tp_degree, bz, num_tokens_per_batch,
# print(f"Standard deviation of step time: {std_step_time:.3f} milliseconds")
print(f"Step time: {avg_step_time:.3f} ± {std_step_time:.3f} ms ({len(filtered_df)} entries)")

values_of_interest=[1,10,19,27,36,45,54,62,71,80]
values_of_interest=[0, 1,10,19,27,36,45,54,62,71,80]

# Second analysis: Variable finetuning tokens
filtered_df_2 = df[
@@ -141,7 +141,7 @@ def plot_bwd_overhead(filepath, model_name, tp_degree, bz, num_tokens_per_batch,
avg_step_time=('step_time', 'mean'),
std_step_time=('step_time', 'std')
).reset_index()

print(avg_std_df['avg_step_time'])
plt.errorbar(avg_std_df['num_bwd_layers'],
avg_std_df['avg_step_time'],
yerr=avg_std_df['std_step_time'],
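
For reference, the lines above follow a group-then-plot pattern: step times are grouped by num_bwd_layers, and the mean is plotted with the standard deviation as error bars. A minimal sketch of that pattern, with made-up measurements but the same column names and errorbar call, looks like this:

import pandas as pd
import matplotlib.pyplot as plt

# Toy data: a few step-time measurements per backward-layer count (values are illustrative).
df = pd.DataFrame({
    "num_bwd_layers": [1, 1, 2, 2, 4, 4],
    "step_time":      [10.2, 10.6, 13.1, 12.8, 17.9, 18.4],
})

# Named aggregation: one row per num_bwd_layers with mean and std of step_time.
avg_std_df = df.groupby("num_bwd_layers").agg(
    avg_step_time=("step_time", "mean"),
    std_step_time=("step_time", "std"),
).reset_index()

plt.errorbar(avg_std_df["num_bwd_layers"],
             avg_std_df["avg_step_time"],
             yerr=avg_std_df["std_step_time"],
             marker="o", capsize=3)
plt.xlabel("Number of backward finetuning layers")
plt.ylabel("Step time (ms)")
plt.show()
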
754 changes: 503 additions & 251 deletions benchmarking/traces/sharegpt.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions include/flexflow/model.h
@@ -847,6 +847,7 @@ class FFModel {
// Inference APIs
// ========================================
std::vector<GenerationResult> generate(std::vector<Request> const &requests);
std::vector<GenerationResult> generate_online(std::vector<Request> const &inference_requests, std::vector<Request> const &ft_requests);

Tensor create_tensor_legion_ordering(int num_dim,
int const dims[],
1 change: 1 addition & 0 deletions include/flexflow/request_manager.h
@@ -118,6 +118,7 @@ struct Request {
bool add_special_tokens = true;
bool warmup = false;
Status status = PENDING;
long long arrival_time_us = 0;
// inference fields
std::string prompt;
std::vector<BatchConfig::TokenId> tokens;
67 changes: 0 additions & 67 deletions inference/peft/CMakeLists.txt
@@ -36,73 +36,6 @@ target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-w
set(BIN_DEST "bin")
install(TARGETS ${project_target1} DESTINATION ${BIN_DEST})

# FWD benchmark
set(project_target2 peft_fwd_benchmark)
set(CPU_SRC2
${FLEXFLOW_CPP_DRV_SRC}
peft_fwd_benchmark.cc
../models/llama.cc
../models/opt.cc
../models/falcon.cc
../models/starcoder.cc
../models/mpt.cc)

if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
cuda_add_executable(${project_target2} ${CPU_SRC2})
if (FF_GPU_BACKEND STREQUAL "hip_cuda")
target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_NVIDIA__)
endif()
elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
set_source_files_properties(${CPU_SRC2} PROPERTIES LANGUAGE HIP)
hip_add_executable(${project_target2} ${CPU_SRC2})
if (FF_HIP_ARCH STREQUAL "")
message(FATAL_ERROR "FF_HIP_ARCH is empty!")
endif()
set_property(TARGET ${project_target2} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}")
target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_AMD__)
else()
message(FATAL_ERROR "Compilation of ${project_target2} for ${FF_GPU_BACKEND} backend not yet supported")
endif()

target_include_directories(${project_target2} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR})
target_include_directories(${project_target2} PRIVATE ${CMAKE_SOURCE_DIR}/inference)
target_link_libraries(${project_target2} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES})
set(BIN_DEST "bin")
install(TARGETS ${project_target2} DESTINATION ${BIN_DEST})

# BWD benchmark
set(project_target3 peft_bwd_benchmark)
set(CPU_SRC3
${FLEXFLOW_CPP_DRV_SRC}
peft_bwd_benchmark.cc
../models/llama.cc
../models/opt.cc
../models/falcon.cc
../models/starcoder.cc
../models/mpt.cc)

if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
cuda_add_executable(${project_target3} ${CPU_SRC3})
if (FF_GPU_BACKEND STREQUAL "hip_cuda")
target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__)
endif()
elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP)
hip_add_executable(${project_target3} ${CPU_SRC3})
if (FF_HIP_ARCH STREQUAL "")
message(FATAL_ERROR "FF_HIP_ARCH is empty!")
endif()
set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}")
target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__)
else()
message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported")
endif()

target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR})
target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference)
target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES})
set(BIN_DEST "bin")
install(TARGETS ${project_target3} DESTINATION ${BIN_DEST})

# Online peft
set(project_target4 req_rate_benchmark)
