Skip to content

Commit

Permalink
GitHub tar processing and file/directory parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
spolu committed Jan 4, 2024
1 parent 784ea95 commit 0edaa90
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 9 deletions.
24 changes: 18 additions & 6 deletions connectors/src/admin/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ import {
STOP_CONNECTOR_BY_TYPE,
SYNC_CONNECTOR_BY_TYPE,
} from "@connectors/connectors";
import {
cleanUpProcessRepository,
processRepository,
} from "@connectors/connectors/github/lib/github_api";
import {
getAuthObject,
getDocumentId,
Expand All @@ -28,10 +32,6 @@ import { NotionDatabase, NotionPage } from "@connectors/lib/models/notion";
import { SlackConfiguration } from "@connectors/lib/models/slack";
import { nango_client } from "@connectors/lib/nango_client";
import { Result } from "@connectors/lib/result";
import {
cleanUpProcessRepository,
processRepository,
} from "@connectors/connectors/github/lib/github_api";

const { NANGO_SLACK_CONNECTOR_ID } = process.env;

Expand Down Expand Up @@ -134,14 +134,26 @@ const github = async (command: string, args: parseArgs.ParsedArgs) => {
}

const installationId = connector.connectionId;
const { tempDir, files } = await processRepository(
const { tempDir, files, directories } = await processRepository(
installationId,
args.owner,
args.repo,
"999"
);

console.log(files);
files.forEach((f) => {
console.log(f);
});
directories.forEach((d) => {
console.log(d);
});

console.log(
`Found ${files.length} files in ${directories.length} directories`
);
console.log(
`Files total size: ${files.reduce((acc, f) => acc + f.sizeBytes, 0)}`
);

await cleanUpProcessRepository(tempDir);
}
Expand Down
49 changes: 46 additions & 3 deletions connectors/src/connectors/github/lib/github_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,11 @@ const EXTENSION_WHITELIST = [
".yaml",
".yml",
".md",
".c",
".h",
".cc",
".cpp",
".hpp",
];

const FILENAME_WHITELIST = [
Expand Down Expand Up @@ -662,6 +667,15 @@ export async function processRepository(
parents: string[];
localFilePath: string;
}[] = [];
const seenDirs: { [key: string]: boolean } = {};
const directories: {
dirName: string;
dirPath: string[];
sourceUrl: string;
internalId: string;
parentInternalId: string | null;
parents: string[];
}[] = [];

// Iterate over the files in the temp directory.
for await (const file of getFiles(tempDir)) {
Expand All @@ -684,6 +698,7 @@ export async function processRepository(
.slice(1, -1);

const pathInternalIds = [];

for (let i = 0; i < path.length; i++) {
const p = `github-code-${repoId}-dir-${path.slice(0, i + 1).join("/")}`;
pathInternalIds.push(
Expand All @@ -705,24 +720,52 @@ export async function processRepository(
? null
: (pathInternalIds[pathInternalIds.length - 1] as string);

// Files
files.push({
fileName,
filePath: path,
sourceUrl: `https://github.com/${login}/${repoName}/blob/${defaultBranch}/${path.join(
"/"
)}/${fileName}`,
sourceUrl: `https://github.com/${login}/${repoName}/blob/${defaultBranch}/${join(
path.join("/"),
fileName
)}`,
sizeBytes: size,
documentId,
parentInternalId,
parents: pathInternalIds,
localFilePath: file,
});

// Directories
if (parentInternalId && !seenDirs[parentInternalId]) {
seenDirs[parentInternalId] = true;

const dirName = path[path.length - 1] || "";
const dirPath = path.slice(0, -1);
const internalId = parentInternalId;
const dirParentInternalId =
pathInternalIds.length === 2
? null
: (pathInternalIds[pathInternalIds.length - 2] as string);

directories.push({
dirName,
dirPath,
sourceUrl: `https://github.com/${login}/${repoName}/blob/${defaultBranch}/${join(
dirPath.join("/"),
dirName
)}`,
internalId,
parentInternalId: dirParentInternalId,
parents: pathInternalIds.slice(0, -1),
});
}
}
}

return {
tempDir,
files,
directories,
};
}

Expand Down

0 comments on commit 0edaa90

Please sign in to comment.