fixes title standardization issue, contents having tokens issue and a… #862

Open · wants to merge 1 commit into base: dev

dpk_repo_level_order/internal/fix_title_contents_add_filepath.py (new file)
@@ -0,0 +1,72 @@
import pandas as pd


def fix_title_contents(title_column_name='title',
                       repo_column_name='repo_name',
                       dataset_column_name='dataset',
                       contents_column_name='contents'):
    """Return a function that normalizes titles and strips metadata tags from contents."""

    def fix_title_contents_func(table):
        table = fix_title(table, title_column_name, repo_column_name, dataset_column_name)
        table = fix_contents(table, contents_column_name, dataset_column_name)
        return table

    return fix_title_contents_func


def fix_bluepile_github_title(title: str, repo_name: str):
    # Example: title 'dive-in-mall-master/mall-mini-program/components/hot-list/index.js'
    # with repo_name 'liuhuiAndroid/dive-in-mall'
    actual_repo_name = repo_name.split("/")[-1].strip("/")
    if title.startswith(actual_repo_name):
        title = '/'.join(title.split("/")[1:])
    return title


def fix_abap_title(title: str, repo_name: str):
    if title.startswith(repo_name):
        title = title[len(repo_name) + 1:]
    return title


def fix_title(table: pd.DataFrame, title_column_name: str, repo_column_name: str, dataset_column_name: str):
    # Remove the forward slash from the start of the title; this also removes the
    # leading forward slash in the abap dataset.
    table[title_column_name] = table[title_column_name].str.lstrip("/")
    # Remove repo name, org name and branch name from the title.
    bg_idx = (table[dataset_column_name] == "bluepile_github")
    if bg_idx.any():
        table.loc[bg_idx, title_column_name] = table.loc[bg_idx].apply(
            lambda row: fix_bluepile_github_title(row[title_column_name], row[repo_column_name]), axis=1)
    abap_idx = (table[dataset_column_name] == "abap")
    if abap_idx.any():
        table.loc[abap_idx, title_column_name] = table.loc[abap_idx].apply(
            lambda row: fix_abap_title(row[title_column_name], row[repo_column_name]), axis=1)
    return table


def fix_startcoder_contents(contents: str):
    # Drop StarCoder metadata tag lines (<filename>, <reponame>, <gh_stars>) from contents.
    def is_tag_absent(line: str):
        if line.startswith("<filename>"):
            return False
        if line.startswith("<reponame>"):
            return False
        if line.startswith("<gh_stars>"):
            return False
        return True

    contents_lines = contents.splitlines()
    contents_lines = [ele for ele in contents_lines if is_tag_absent(ele)]
    return '\n'.join(contents_lines)


def fix_contents(table, contents_column_name, dataset_column_name):
    sc_idx = (table[dataset_column_name] == "starcoder")
    if sc_idx.any():
        table.loc[sc_idx, contents_column_name] = table.loc[sc_idx, contents_column_name].apply(fix_startcoder_contents)
    return table


def prepend_filename_token_filepath(title, repo, contents):
    file_path = f"{repo.split('/')[-1]}/{title}"
    result = f"<filename>{file_path}\n{contents}"
    return result


if __name__ == "__main__":
    # Ad-hoc smoke test against a local sample CSV; df_copy keeps an untouched
    # copy for manual before/after comparison.
    df = pd.read_csv("/data/shanmukh/forked_repos/data-prep-kit/sc.csv")
    df_copy = df.copy(deep=True)
    df = fix_title_contents(title_column_name="filepath")(df)
    print(df.head())
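
Editorial note, not part of the PR: a minimal sketch of how these helpers behave on a toy DataFrame, assuming the module path introduced by this PR and the default column names ('title', 'repo_name', 'dataset', 'contents'). The sample rows are invented for illustration.

import pandas as pd

from dpk_repo_level_order.internal.fix_title_contents_add_filepath import (
    fix_title_contents,
    prepend_filename_token_filepath,
)

toy = pd.DataFrame({
    "title": ["dive-in-mall-master/mall-mini-program/components/hot-list/index.js",
              "/src/utils/math.py"],
    "repo_name": ["liuhuiAndroid/dive-in-mall", "someorg/somerepo"],
    "dataset": ["bluepile_github", "starcoder"],
    "contents": ["Page({})", "<reponame>someorg/somerepo\n<gh_stars>10\nprint('hi')"],
})

toy = fix_title_contents()(toy)
print(toy.loc[0, "title"])     # mall-mini-program/components/hot-list/index.js
print(toy.loc[1, "title"])     # src/utils/math.py
print(toy.loc[1, "contents"])  # print('hi')  (StarCoder tag lines removed)

# Packing step used later in this diff: prepend the <filename> token and repo-relative path.
print(prepend_filename_token_filepath(toy.loc[0, "title"],
                                      toy.loc[0, "repo_name"],
                                      toy.loc[0, "contents"]))
# <filename>dive-in-mall/mall-mini-program/components/hot-list/index.js
# Page({})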
@@ -18,15 +18,14 @@
import pandas as pd
import pyarrow as pa
from dpk_repo_level_order.internal.check_languages import (
    get_dominant_language_repo_packing,
)
from dpk_repo_level_order.internal.sorting.semantic_ordering import (
check_and_update_title,
sort_by_path,
sort_sem,
)
from func_timeout.exceptions import FunctionTimedOut

from dpk_repo_level_order.internal.fix_title_contents_add_filepath import prepend_filename_token_filepath

SORT_BY_PATH = "SORT_BY_PATH"
SORT_SEMANTIC = "SORT_SEMANTIC"
@@ -160,7 +159,14 @@ def lang_distribution(grouping_column):
lang_dist[k.as_py()] = v.as_py()
return lang_dist

title_column_name = "title"
super_row = table.column("contents").to_pylist()
titles = table.column(title_column_name).to_pylist()
repo_names = table.column(repo_column_name).to_pylist()
super_row = [
prepend_filename_token_filepath(title, repo_name, contents)
for title, repo_name, contents in zip(titles, repo_names, super_row)
]
repo_doc_ids = table.column("document_id").to_pylist()
lang_dist = lang_distribution(language_column_name)

@@ -189,7 +195,7 @@ def lang_distribution(grouping_column):
return new_table


def get_transforming_func(sorting_func=None, superrows_func=None, filename_func=None, language_column_name="language", fix_title_contents_func=None):
"""
This function takes three optional functions as input and returns a
function that can be applied to a pyarrow table and file name.
@@ -214,8 +220,10 @@ def get_transforming_func(sorting_func=None, superrows_func=None, filename_func=

def my_transform(table, file_name):
out_table = table
if fix_title_contents_func:
out_table = fix_title_contents_func(out_table)
if sorting_func:
out_table = sorting_func(out_table, file_name)
if filename_func:
file_name = filename_func(table, file_name)
if superrows_func:
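
Editorial note, not part of the PR: a self-contained sketch of the ordering this hunk establishes. `make_transform` is a local stand-in, not the real get_transforming_func; it only mirrors the shown logic, where the fix step runs first and sorting_func now receives the already-fixed out_table rather than the untouched table, so the fix is not discarded when sorting is enabled.

import pandas as pd

from dpk_repo_level_order.internal.fix_title_contents_add_filepath import fix_title_contents


def make_transform(sorting_func=None, fix_title_contents_func=None):
    # Local stand-in mirroring my_transform's ordering: fix first, then sort.
    def my_transform(table, file_name):
        out_table = table
        if fix_title_contents_func:
            out_table = fix_title_contents_func(out_table)
        if sorting_func:
            out_table = sorting_func(out_table, file_name)
        return out_table

    return my_transform


demo = pd.DataFrame({
    "title": ["/src/b.py", "/src/a.py"],
    "repo_name": ["org/repo", "org/repo"],
    "dataset": ["other", "other"],
    "contents": ["b = 2", "a = 1"],
})
transform = make_transform(
    sorting_func=lambda tbl, name: tbl.sort_values("title"),  # stand-in sorter
    fix_title_contents_func=fix_title_contents(),
)
print(transform(demo, "demo.parquet")["title"].tolist())  # ['src/a.py', 'src/b.py']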
@@ -259,6 +259,15 @@ def _prepare_mapper_function(self):
get_dominant_language_func,
get_transforming_func,
)
from dpk_repo_level_order.internal.fix_title_contents_add_filepath import fix_title_contents
title_column_name = 'title'
repo_column_name = 'repo_name'
dataset_column_name = 'dataset'
contents_column_name = 'contents'
fix_title_contents_func = fix_title_contents(title_column_name, repo_column_name, dataset_column_name, contents_column_name)
mapper_function_params = mapper_function_params | {
"fix_title_contents_func": fix_title_contents_func,
}

if self.sorting_enabled:
self.logger.info(f"Repo level sorting is enabled. Algo: {self.sorting_algo}")
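
Editorial note on the merge above (base_params and its key are hypothetical, shown only for illustration): the dict-union operator `|` (Python 3.9+) returns a new mapping in which right-hand keys win, so the fix function is added to mapper_function_params without mutating the original dict.

from dpk_repo_level_order.internal.fix_title_contents_add_filepath import fix_title_contents

base_params = {"language_column_name": "language"}   # hypothetical existing params
merged = base_params | {"fix_title_contents_func": fix_title_contents()}
print(sorted(merged))  # ['fix_title_contents_func', 'language_column_name']
print(base_params)     # {'language_column_name': 'language'}  (unchanged)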