From db8a14abef75fdc73d303b7bd417e6e69a355cba Mon Sep 17 00:00:00 2001 From: Lord of Abyss <103809695+Abyss-lord@users.noreply.github.com> Date: Mon, 13 Jan 2025 05:52:30 +0800 Subject: [PATCH 01/40] [#6177] improve(CLI): Refactor ownership commands in Gravitino CLI (#6188) ### What changes were proposed in this pull request? Refactor ownership commands in Gravitino CLI. ### Why are the changes needed? Fix: #6177 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? local test. --- .../gravitino/cli/GravitinoCommandLine.java | 41 +----- .../gravitino/cli/OwnerCommandHandler.java | 128 ++++++++++++++++++ .../gravitino/cli/TestOwnerCommands.java | 79 +++++++++++ 3 files changed, 208 insertions(+), 40 deletions(-) create mode 100644 clients/cli/src/main/java/org/apache/gravitino/cli/OwnerCommandHandler.java diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java index 675a96d36a8..9c9ce6810ba 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java @@ -127,7 +127,7 @@ private void executeCommand() { if (CommandActions.HELP.equals(command)) { handleHelpCommand(); } else if (line.hasOption(GravitinoOptions.OWNER)) { - handleOwnerCommand(); + new OwnerCommandHandler(this, line, command, ignore, entity).handle(); } else if (entity.equals(CommandEntities.COLUMN)) { handleColumnCommand(); } else if (entity.equals(CommandEntities.TABLE)) { @@ -554,45 +554,6 @@ private void handleHelpCommand() { } } - /** - * Handles the command execution for Objects based on command type and the command line options. 
- */ - private void handleOwnerCommand() { - String url = getUrl(); - String auth = getAuth(); - String userName = line.getOptionValue(GravitinoOptions.LOGIN); - FullName name = new FullName(line); - String metalake = name.getMetalakeName(); - String entityName = line.getOptionValue(GravitinoOptions.NAME); - - Command.setAuthenticationMode(auth, userName); - - switch (command) { - case CommandActions.DETAILS: - newOwnerDetails(url, ignore, metalake, entityName, entity).handle(); - break; - - case CommandActions.SET: - { - String owner = line.getOptionValue(GravitinoOptions.USER); - String group = line.getOptionValue(GravitinoOptions.GROUP); - - if (owner != null && group == null) { - newSetOwner(url, ignore, metalake, entityName, entity, owner, false).handle(); - } else if (owner == null && group != null) { - newSetOwner(url, ignore, metalake, entityName, entity, group, true).handle(); - } else { - System.err.println(ErrorMessages.INVALID_SET_COMMAND); - } - break; - } - - default: - System.err.println(ErrorMessages.UNSUPPORTED_ACTION); - break; - } - } - /** * Handles the command execution for filesets based on command type and the command line options. */ diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/OwnerCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/OwnerCommandHandler.java new file mode 100644 index 00000000000..7e41fb478ae --- /dev/null +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/OwnerCommandHandler.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.cli; + +import org.apache.commons.cli.CommandLine; +import org.apache.gravitino.cli.commands.Command; + +/** Handles the command execution for Owner based on command type and the command line options. */ +public class OwnerCommandHandler extends CommandHandler { + private final GravitinoCommandLine gravitinoCommandLine; + private final CommandLine line; + private final String command; + private final boolean ignore; + private final String url; + private final FullName name; + private final String metalake; + private final String entityName; + private final String owner; + private final String group; + private final String entity; + + /** + * Constructs a {@link OwnerCommandHandler} instance. + * + * @param gravitinoCommandLine The Gravitino command line instance. + * @param line The command line arguments. + * @param command The command to execute. + * @param ignore Ignore server version mismatch. + * @param entity The entity to execute the command on. 
+ */ + public OwnerCommandHandler( + GravitinoCommandLine gravitinoCommandLine, + CommandLine line, + String command, + boolean ignore, + String entity) { + this.gravitinoCommandLine = gravitinoCommandLine; + this.line = line; + this.command = command; + this.ignore = ignore; + + this.url = getUrl(line); + this.owner = line.getOptionValue(GravitinoOptions.USER); + this.group = line.getOptionValue(GravitinoOptions.GROUP); + this.name = new FullName(line); + this.metalake = name.getMetalakeName(); + this.entityName = name.getName(); + this.entity = entity; + } + /** Handles the command execution logic based on the provided command. */ + @Override + protected void handle() { + String userName = line.getOptionValue(GravitinoOptions.LOGIN); + Command.setAuthenticationMode(getAuth(line), userName); + + if (entityName == null && !CommandEntities.METALAKE.equals(entity)) { + System.err.println(ErrorMessages.MISSING_NAME); + Main.exit(-1); + } + if (!executeCommand()) { + System.err.println(ErrorMessages.UNSUPPORTED_COMMAND); + Main.exit(-1); + } + } + + /** + * Executes the specific command based on the command type. + * + * @return true if the command is supported, false otherwise + */ + private boolean executeCommand() { + switch (command) { + case CommandActions.DETAILS: + handleDetailsCommand(); + return true; + + case CommandActions.SET: + handleSetCommand(); + return true; + + default: + return false; + } + } + + /** Handles the "DETAILS" command. */ + private void handleDetailsCommand() { + gravitinoCommandLine + .newOwnerDetails(url, ignore, metalake, entityName, entity) + .validate() + .handle(); + } + + /** Handles the "SET" command. 
*/ + private void handleSetCommand() { + if (owner != null && group == null) { + gravitinoCommandLine + .newSetOwner(url, ignore, metalake, entityName, entity, owner, false) + .validate() + .handle(); + } else if (owner == null && group != null) { + gravitinoCommandLine + .newSetOwner(url, ignore, metalake, entityName, entity, group, true) + .validate() + .handle(); + } else { + System.err.println(ErrorMessages.INVALID_SET_COMMAND); + Main.exit(-1); + } + } +} diff --git a/clients/cli/src/test/java/org/apache/gravitino/cli/TestOwnerCommands.java b/clients/cli/src/test/java/org/apache/gravitino/cli/TestOwnerCommands.java index 0c2b2cf91e5..12f617380ca 100644 --- a/clients/cli/src/test/java/org/apache/gravitino/cli/TestOwnerCommands.java +++ b/clients/cli/src/test/java/org/apache/gravitino/cli/TestOwnerCommands.java @@ -19,16 +19,25 @@ package org.apache.gravitino.cli; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.isNull; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Options; import org.apache.gravitino.cli.commands.OwnerDetails; import org.apache.gravitino.cli.commands.SetOwner; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -36,10 +45,23 @@ class TestOwnerCommands { private CommandLine mockCommandLine; private Options mockOptions; + private final ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + private final ByteArrayOutputStream errContent = new 
ByteArrayOutputStream(); + private final PrintStream originalOut = System.out; + private final PrintStream originalErr = System.err; + @BeforeEach void setUp() { mockCommandLine = mock(CommandLine.class); mockOptions = mock(Options.class); + System.setOut(new PrintStream(outContent)); + System.setErr(new PrintStream(errContent)); + } + + @AfterEach + public void restoreStreams() { + System.setOut(originalOut); + System.setErr(originalErr); } @Test @@ -67,6 +89,7 @@ void testSetOwnerUserCommand() { "catalog", "admin", false); + doReturn(mockSetOwner).when(mockSetOwner).validate(); commandLine.handleCommandLine(); verify(mockSetOwner).handle(); } @@ -96,6 +119,7 @@ void testSetOwnerGroupCommand() { "catalog", "ITdept", true); + doReturn(mockSetOwner).when(mockSetOwner).validate(); commandLine.handleCommandLine(); verify(mockSetOwner).handle(); } @@ -116,7 +140,62 @@ void testOwnerDetailsCommand() { .when(commandLine) .newOwnerDetails( GravitinoCommandLine.DEFAULT_URL, false, "metalake_demo", "postgres", "catalog"); + doReturn(mockOwnerDetails).when(mockOwnerDetails).validate(); commandLine.handleCommandLine(); verify(mockOwnerDetails).handle(); } + + @Test + void testOwnerDetailsCommandWithoutName() { + Main.useExit = false; + when(mockCommandLine.hasOption(GravitinoOptions.METALAKE)).thenReturn(true); + when(mockCommandLine.getOptionValue(GravitinoOptions.METALAKE)).thenReturn("metalake_demo"); + when(mockCommandLine.hasOption(GravitinoOptions.NAME)).thenReturn(false); + when(mockCommandLine.hasOption(GravitinoOptions.OWNER)).thenReturn(true); + GravitinoCommandLine commandLine = + spy( + new GravitinoCommandLine( + mockCommandLine, mockOptions, CommandEntities.CATALOG, CommandActions.DETAILS)); + + assertThrows(RuntimeException.class, commandLine::handleCommandLine); + verify(commandLine, never()) + .newOwnerDetails( + eq(GravitinoCommandLine.DEFAULT_URL), + eq(false), + eq("metalake_demo"), + eq(null), + eq(CommandEntities.CATALOG)); + + String errOutput = new 
String(errContent.toByteArray(), StandardCharsets.UTF_8).trim(); + assertEquals(ErrorMessages.MISSING_NAME, errOutput); + } + + @Test + void testSetOwnerUserCommandWithoutUserAndGroup() { + Main.useExit = false; + when(mockCommandLine.hasOption(GravitinoOptions.METALAKE)).thenReturn(true); + when(mockCommandLine.getOptionValue(GravitinoOptions.METALAKE)).thenReturn("metalake_demo"); + when(mockCommandLine.hasOption(GravitinoOptions.NAME)).thenReturn(true); + when(mockCommandLine.getOptionValue(GravitinoOptions.NAME)).thenReturn("postgres"); + when(mockCommandLine.hasOption(GravitinoOptions.USER)).thenReturn(false); + when(mockCommandLine.hasOption(GravitinoOptions.GROUP)).thenReturn(false); + when(mockCommandLine.hasOption(GravitinoOptions.OWNER)).thenReturn(true); + GravitinoCommandLine commandLine = + spy( + new GravitinoCommandLine( + mockCommandLine, mockOptions, CommandEntities.CATALOG, CommandActions.SET)); + + assertThrows(RuntimeException.class, commandLine::handleCommandLine); + verify(commandLine, never()) + .newSetOwner( + eq(GravitinoCommandLine.DEFAULT_URL), + eq(false), + eq("metalake_demo"), + eq("postgres"), + eq(CommandEntities.CATALOG), + isNull(), + eq(false)); + String errOutput = new String(errContent.toByteArray(), StandardCharsets.UTF_8).trim(); + assertEquals(ErrorMessages.INVALID_SET_COMMAND, errOutput); + } } From 80d6daa3f89464319942052f4bffd9e931095184 Mon Sep 17 00:00:00 2001 From: TungYuChiang <75083792+TungYuChiang@users.noreply.github.com> Date: Mon, 13 Jan 2025 05:54:16 +0800 Subject: [PATCH 02/40] [#6149] improve(CLI): Refactor column commands in Gravitino CLI (#6190) ### What changes were proposed in this pull request? Refactor column commands CLI ### Why are the changes needed? Fix: #6149 ### Does this PR introduce _any_ user-facing change? None ### How was this patch tested? 
Tested locally --- .../gravitino/cli/ColumnCommandHandler.java | 236 ++++++++++++++++++ .../gravitino/cli/GravitinoCommandLine.java | 137 +--------- 2 files changed, 237 insertions(+), 136 deletions(-) create mode 100644 clients/cli/src/main/java/org/apache/gravitino/cli/ColumnCommandHandler.java diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/ColumnCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/ColumnCommandHandler.java new file mode 100644 index 00000000000..96f056c1a3c --- /dev/null +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/ColumnCommandHandler.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.cli; + +import com.google.common.collect.Lists; +import java.util.List; +import org.apache.commons.cli.CommandLine; +import org.apache.gravitino.cli.commands.Command; + +/** Handles the command execution for Columns based on command type and the command line options. 
*/ +public class ColumnCommandHandler extends CommandHandler { + private final GravitinoCommandLine gravitinoCommandLine; + private final CommandLine line; + private final String command; + private final boolean ignore; + private final String url; + private final FullName name; + private final String metalake; + private final String catalog; + private final String schema; + private final String table; + private String column; + + /** + * Constructs a {@link ColumnCommandHandler} instance. + * + * @param gravitinoCommandLine The Gravitino command line instance. + * @param line The command line arguments. + * @param command The command to execute. + * @param ignore Ignore server version mismatch. + */ + public ColumnCommandHandler( + GravitinoCommandLine gravitinoCommandLine, CommandLine line, String command, boolean ignore) { + this.gravitinoCommandLine = gravitinoCommandLine; + this.line = line; + this.command = command; + this.ignore = ignore; + + this.url = gravitinoCommandLine.getUrl(); + this.name = new FullName(line); + this.metalake = name.getMetalakeName(); + this.catalog = name.getCatalogName(); + this.schema = name.getSchemaName(); + this.table = name.getTableName(); + } + + /** Handles the command execution logic based on the provided command. 
*/ + @Override + protected void handle() { + String userName = line.getOptionValue(GravitinoOptions.LOGIN); + Command.setAuthenticationMode(gravitinoCommandLine.getAuth(), userName); + + List<String> missingEntities = Lists.newArrayList(); + if (catalog == null) missingEntities.add(CommandEntities.CATALOG); + if (schema == null) missingEntities.add(CommandEntities.SCHEMA); + if (table == null) missingEntities.add(CommandEntities.TABLE); + + if (CommandActions.LIST.equals(command)) { + checkEntities(missingEntities); + handleListCommand(); + return; + } + + this.column = name.getColumnName(); + if (column == null) missingEntities.add(CommandEntities.COLUMN); + checkEntities(missingEntities); + + if (!executeCommand()) { + System.err.println(ErrorMessages.UNSUPPORTED_ACTION); + Main.exit(-1); + } + } + + /** + * Executes the specific command based on the command type. + * + * @return true if the command is supported, false otherwise + */ + private boolean executeCommand() { + switch (command) { + case CommandActions.DETAILS: + handleDetailsCommand(); + return true; + + case CommandActions.CREATE: + handleCreateCommand(); + return true; + + case CommandActions.DELETE: + handleDeleteCommand(); + return true; + + case CommandActions.UPDATE: + handleUpdateCommand(); + return true; + + default: + return false; + } + } + + /** Handles the "DETAILS" command. */ + private void handleDetailsCommand() { + if (line.hasOption(GravitinoOptions.AUDIT)) { + gravitinoCommandLine + .newColumnAudit(url, ignore, metalake, catalog, schema, table, column) + .validate() + .handle(); + } else { + System.err.println(ErrorMessages.UNSUPPORTED_ACTION); + Main.exit(-1); + } + } + + /** Handles the "CREATE" command. 
*/ + private void handleCreateCommand() { + String datatype = line.getOptionValue(GravitinoOptions.DATATYPE); + String comment = line.getOptionValue(GravitinoOptions.COMMENT); + String position = line.getOptionValue(GravitinoOptions.POSITION); + boolean nullable = + !line.hasOption(GravitinoOptions.NULL) + || line.getOptionValue(GravitinoOptions.NULL).equals("true"); + boolean autoIncrement = + line.hasOption(GravitinoOptions.AUTO) + && line.getOptionValue(GravitinoOptions.AUTO).equals("true"); + String defaultValue = line.getOptionValue(GravitinoOptions.DEFAULT); + + gravitinoCommandLine + .newAddColumn( + url, + ignore, + metalake, + catalog, + schema, + table, + column, + datatype, + comment, + position, + nullable, + autoIncrement, + defaultValue) + .validate() + .handle(); + } + + /** Handles the "DELETE" command. */ + private void handleDeleteCommand() { + gravitinoCommandLine + .newDeleteColumn(url, ignore, metalake, catalog, schema, table, column) + .validate() + .handle(); + } + + /** Handles the "UPDATE" command. 
*/ + private void handleUpdateCommand() { + if (line.hasOption(GravitinoOptions.COMMENT)) { + String comment = line.getOptionValue(GravitinoOptions.COMMENT); + gravitinoCommandLine + .newUpdateColumnComment(url, ignore, metalake, catalog, schema, table, column, comment) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.RENAME)) { + String newName = line.getOptionValue(GravitinoOptions.RENAME); + gravitinoCommandLine + .newUpdateColumnName(url, ignore, metalake, catalog, schema, table, column, newName) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.DATATYPE) && !line.hasOption(GravitinoOptions.DEFAULT)) { + String datatype = line.getOptionValue(GravitinoOptions.DATATYPE); + gravitinoCommandLine + .newUpdateColumnDatatype(url, ignore, metalake, catalog, schema, table, column, datatype) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.POSITION)) { + String position = line.getOptionValue(GravitinoOptions.POSITION); + gravitinoCommandLine + .newUpdateColumnPosition(url, ignore, metalake, catalog, schema, table, column, position) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.NULL)) { + boolean nullable = line.getOptionValue(GravitinoOptions.NULL).equals("true"); + gravitinoCommandLine + .newUpdateColumnNullability( + url, ignore, metalake, catalog, schema, table, column, nullable) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.AUTO)) { + boolean autoIncrement = line.getOptionValue(GravitinoOptions.AUTO).equals("true"); + gravitinoCommandLine + .newUpdateColumnAutoIncrement( + url, ignore, metalake, catalog, schema, table, column, autoIncrement) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.DEFAULT)) { + String defaultValue = line.getOptionValue(GravitinoOptions.DEFAULT); + String dataType = line.getOptionValue(GravitinoOptions.DATATYPE); + gravitinoCommandLine + .newUpdateColumnDefault( + url, ignore, metalake, catalog, schema, table, column, 
defaultValue, dataType) + .validate() + .handle(); + } + } + + /** Handles the "LIST" command. */ + private void handleListCommand() { + gravitinoCommandLine + .newListColumns(url, ignore, metalake, catalog, schema, table) + .validate() + .handle(); + } +} diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java index 9c9ce6810ba..b883502e805 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java @@ -129,7 +129,7 @@ private void executeCommand() { } else if (line.hasOption(GravitinoOptions.OWNER)) { new OwnerCommandHandler(this, line, command, ignore, entity).handle(); } else if (entity.equals(CommandEntities.COLUMN)) { - handleColumnCommand(); + new ColumnCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.TABLE)) { new TableCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.SCHEMA)) { @@ -401,141 +401,6 @@ private String getOneTag(String[] tags) { return tags[0]; } - /** - * Handles the command execution for Columns based on command type and the command line options. 
- */ - private void handleColumnCommand() { - String url = getUrl(); - String auth = getAuth(); - String userName = line.getOptionValue(GravitinoOptions.LOGIN); - FullName name = new FullName(line); - String metalake = name.getMetalakeName(); - String catalog = name.getCatalogName(); - String schema = name.getSchemaName(); - String table = name.getTableName(); - - Command.setAuthenticationMode(auth, userName); - - List<String> missingEntities = Lists.newArrayList(); - if (catalog == null) missingEntities.add(CommandEntities.CATALOG); - if (schema == null) missingEntities.add(CommandEntities.SCHEMA); - if (table == null) missingEntities.add(CommandEntities.TABLE); - - if (CommandActions.LIST.equals(command)) { - checkEntities(missingEntities); - newListColumns(url, ignore, metalake, catalog, schema, table).validate().handle(); - return; - } - - String column = name.getColumnName(); - if (column == null) missingEntities.add(CommandEntities.COLUMN); - checkEntities(missingEntities); - - switch (command) { - case CommandActions.DETAILS: - if (line.hasOption(GravitinoOptions.AUDIT)) { - newColumnAudit(url, ignore, metalake, catalog, schema, table, column).validate().handle(); - } else { - System.err.println(ErrorMessages.UNSUPPORTED_ACTION); - Main.exit(-1); - } - break; - - case CommandActions.CREATE: - { - String datatype = line.getOptionValue(GravitinoOptions.DATATYPE); - String comment = line.getOptionValue(GravitinoOptions.COMMENT); - String position = line.getOptionValue(GravitinoOptions.POSITION); - boolean nullable = - !line.hasOption(GravitinoOptions.NULL) - || line.getOptionValue(GravitinoOptions.NULL).equals("true"); - boolean autoIncrement = - line.hasOption(GravitinoOptions.AUTO) - && line.getOptionValue(GravitinoOptions.AUTO).equals("true"); - String defaultValue = line.getOptionValue(GravitinoOptions.DEFAULT); - - newAddColumn( - url, - ignore, - metalake, - catalog, - schema, - table, - column, - datatype, - comment, - position, - nullable, - autoIncrement, - 
defaultValue) - .validate() - .handle(); - break; - } - - case CommandActions.DELETE: - newDeleteColumn(url, ignore, metalake, catalog, schema, table, column).validate().handle(); - break; - - case CommandActions.UPDATE: - { - if (line.hasOption(GravitinoOptions.COMMENT)) { - String comment = line.getOptionValue(GravitinoOptions.COMMENT); - newUpdateColumnComment(url, ignore, metalake, catalog, schema, table, column, comment) - .validate() - .handle(); - } - if (line.hasOption(GravitinoOptions.RENAME)) { - String newName = line.getOptionValue(GravitinoOptions.RENAME); - newUpdateColumnName(url, ignore, metalake, catalog, schema, table, column, newName) - .validate() - .handle(); - } - if (line.hasOption(GravitinoOptions.DATATYPE) - && !line.hasOption(GravitinoOptions.DEFAULT)) { - String datatype = line.getOptionValue(GravitinoOptions.DATATYPE); - newUpdateColumnDatatype(url, ignore, metalake, catalog, schema, table, column, datatype) - .validate() - .handle(); - } - if (line.hasOption(GravitinoOptions.POSITION)) { - String position = line.getOptionValue(GravitinoOptions.POSITION); - newUpdateColumnPosition(url, ignore, metalake, catalog, schema, table, column, position) - .validate() - .handle(); - } - if (line.hasOption(GravitinoOptions.NULL)) { - boolean nullable = line.getOptionValue(GravitinoOptions.NULL).equals("true"); - newUpdateColumnNullability( - url, ignore, metalake, catalog, schema, table, column, nullable) - .validate() - .handle(); - } - if (line.hasOption(GravitinoOptions.AUTO)) { - boolean autoIncrement = line.getOptionValue(GravitinoOptions.AUTO).equals("true"); - newUpdateColumnAutoIncrement( - url, ignore, metalake, catalog, schema, table, column, autoIncrement) - .validate() - .handle(); - } - if (line.hasOption(GravitinoOptions.DEFAULT)) { - String defaultValue = line.getOptionValue(GravitinoOptions.DEFAULT); - String dataType = line.getOptionValue(GravitinoOptions.DATATYPE); - newUpdateColumnDefault( - url, ignore, metalake, catalog, schema, 
table, column, defaultValue, dataType) - .validate() - .handle(); - } - break; - } - - default: - System.err.println(ErrorMessages.UNSUPPORTED_ACTION); - Main.exit(-1); - break; - } - } - private void handleHelpCommand() { String helpFile = entity.toLowerCase() + "_help.txt"; From 0b9d89bba96527037f4bbf76c6fc9f814d5e3660 Mon Sep 17 00:00:00 2001 From: SekiXu Date: Mon, 13 Jan 2025 06:55:42 +0800 Subject: [PATCH 03/40] [#6150] improve(CLI): Refactor user commands in Gravitino CLI (#6193) ### What changes were proposed in this pull request? Refactor user commands and Base class in Gravitino CLI. ### Why are the changes needed? Fix: #6150 ### Does this PR introduce any user-facing change? No ### How was this patch tested? local test. --- .../gravitino/cli/GravitinoCommandLine.java | 63 +------ .../gravitino/cli/UserCommandHandler.java | 174 ++++++++++++++++++ 2 files changed, 175 insertions(+), 62 deletions(-) create mode 100644 clients/cli/src/main/java/org/apache/gravitino/cli/UserCommandHandler.java diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java index b883502e805..21d3ed176cb 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java @@ -143,7 +143,7 @@ private void executeCommand() { } else if (entity.equals(CommandEntities.FILESET)) { handleFilesetCommand(); } else if (entity.equals(CommandEntities.USER)) { - handleUserCommand(); + new UserCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.GROUP)) { new GroupCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.TAG)) { @@ -240,67 +240,6 @@ private void handleMetalakeCommand() { } } - /** Handles the command execution for Users based on command type and the command line options. 
*/ - protected void handleUserCommand() { - String url = getUrl(); - String auth = getAuth(); - String userName = line.getOptionValue(GravitinoOptions.LOGIN); - FullName name = new FullName(line); - String metalake = name.getMetalakeName(); - String user = line.getOptionValue(GravitinoOptions.USER); - - Command.setAuthenticationMode(auth, userName); - - if (user == null && !CommandActions.LIST.equals(command)) { - System.err.println(ErrorMessages.MISSING_USER); - Main.exit(-1); - } - - switch (command) { - case CommandActions.DETAILS: - if (line.hasOption(GravitinoOptions.AUDIT)) { - newUserAudit(url, ignore, metalake, user).validate().handle(); - } else { - newUserDetails(url, ignore, metalake, user).validate().handle(); - } - break; - - case CommandActions.LIST: - newListUsers(url, ignore, metalake).validate().handle(); - break; - - case CommandActions.CREATE: - newCreateUser(url, ignore, metalake, user).validate().handle(); - break; - - case CommandActions.DELETE: - boolean force = line.hasOption(GravitinoOptions.FORCE); - newDeleteUser(url, ignore, force, metalake, user).validate().handle(); - break; - - case CommandActions.REVOKE: - String[] revokeRoles = line.getOptionValues(GravitinoOptions.ROLE); - for (String role : revokeRoles) { - newRemoveRoleFromUser(url, ignore, metalake, user, role).validate().handle(); - } - System.out.printf("Remove roles %s from user %s%n", COMMA_JOINER.join(revokeRoles), user); - break; - - case CommandActions.GRANT: - String[] grantRoles = line.getOptionValues(GravitinoOptions.ROLE); - for (String role : grantRoles) { - newAddRoleToUser(url, ignore, metalake, user, role).validate().handle(); - } - System.out.printf("Grant roles %s to user %s%n", COMMA_JOINER.join(grantRoles), user); - break; - - default: - System.err.println(ErrorMessages.UNSUPPORTED_COMMAND); - Main.exit(-1); - break; - } - } - /** Handles the command execution for Tags based on command type and the command line options. 
*/ protected void handleTagCommand() { String url = getUrl(); diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/UserCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/UserCommandHandler.java new file mode 100644 index 00000000000..9a8374ec342 --- /dev/null +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/UserCommandHandler.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.cli; + +import org.apache.commons.cli.CommandLine; +import org.apache.gravitino.cli.commands.Command; + +/** Handles the command execution for Users based on command type and the command line options. */ +public class UserCommandHandler extends CommandHandler { + private final GravitinoCommandLine gravitinoCommandLine; + private final CommandLine line; + private final String command; + private final boolean ignore; + private final String url; + private final FullName name; + private final String metalake; + private String user; + + /** + * Constructs a {@link UserCommandHandler} instance. + * + * @param gravitinoCommandLine The Gravitino command line instance. + * @param line The command line arguments. + * @param command The command to execute. 
+ * @param ignore Ignore server version mismatch. + */ + public UserCommandHandler( + GravitinoCommandLine gravitinoCommandLine, CommandLine line, String command, boolean ignore) { + this.gravitinoCommandLine = gravitinoCommandLine; + this.line = line; + this.command = command; + this.ignore = ignore; + + this.url = getUrl(line); + this.name = new FullName(line); + this.metalake = name.getMetalakeName(); + } + + /** Handles the command execution logic based on the provided command. */ + @Override + protected void handle() { + String userName = line.getOptionValue(GravitinoOptions.LOGIN); + Command.setAuthenticationMode(getAuth(line), userName); + + user = line.getOptionValue(GravitinoOptions.USER); + + if (user == null && !CommandActions.LIST.equals(command)) { + System.err.println(ErrorMessages.MISSING_USER); + Main.exit(-1); + } + + if (!executeCommand()) { + System.err.println(ErrorMessages.UNSUPPORTED_COMMAND); + Main.exit(-1); + } + } + + /** + * Executes the specific command based on the command type. + * + * @return true if the command is supported, false otherwise + */ + private boolean executeCommand() { + switch (command) { + case CommandActions.DETAILS: + handleDetailsCommand(); + return true; + + case CommandActions.LIST: + handleListCommand(); + return true; + + case CommandActions.CREATE: + handleCreateCommand(); + return true; + + case CommandActions.DELETE: + handleDeleteCommand(); + return true; + + case CommandActions.REVOKE: + handleRevokeCommand(); + return true; + + case CommandActions.GRANT: + handleGrantCommand(); + return true; + + default: + return false; + } + } + + /** Handles the "LIST" command. */ + private void handleListCommand() { + this.gravitinoCommandLine + .newListUsers(this.url, this.ignore, this.metalake) + .validate() + .handle(); + } + + /** Handles the "DETAILS" command. 
*/ + private void handleDetailsCommand() { + if (line.hasOption(GravitinoOptions.AUDIT)) { + this.gravitinoCommandLine + .newUserAudit(this.url, this.ignore, this.metalake, user) + .validate() + .handle(); + } else { + this.gravitinoCommandLine + .newUserDetails(this.url, this.ignore, this.metalake, user) + .validate() + .handle(); + } + } + + /** Handles the "CREATE" command. */ + private void handleCreateCommand() { + this.gravitinoCommandLine + .newCreateUser(this.url, this.ignore, this.metalake, user) + .validate() + .handle(); + } + + /** Handles the "DELETE" command. */ + private void handleDeleteCommand() { + boolean force = line.hasOption(GravitinoOptions.FORCE); + this.gravitinoCommandLine + .newDeleteUser(this.url, this.ignore, force, this.metalake, user) + .validate() + .handle(); + } + + /** Handles the "REVOKE" command. */ + private void handleRevokeCommand() { + String[] revokeRoles = line.getOptionValues(GravitinoOptions.ROLE); + for (String role : revokeRoles) { + this.gravitinoCommandLine + .newRemoveRoleFromUser(this.url, this.ignore, this.metalake, user, role) + .validate() + .handle(); + } + System.out.printf("Remove roles %s from user %s%n", COMMA_JOINER.join(revokeRoles), user); + } + + /** Handles the "GRANT" command. */ + private void handleGrantCommand() { + String[] grantRoles = line.getOptionValues(GravitinoOptions.ROLE); + for (String role : grantRoles) { + this.gravitinoCommandLine + .newAddRoleToUser(this.url, this.ignore, this.metalake, user, role) + .validate() + .handle(); + } + System.out.printf("Add roles %s to user %s%n", COMMA_JOINER.join(grantRoles), user); + } +} From bbe3bcf1fdd31ea33a15a940ed57590987250491 Mon Sep 17 00:00:00 2001 From: FANNG Date: Mon, 13 Jan 2025 08:51:08 +0800 Subject: [PATCH 04/40] [MINOR] bump version to 0.9.0-incubating-snapshot (#6094) ### What changes were proposed in this pull request? bump version to 0.9.0-incubating-snapshot ### Why are the changes needed? 
change project version after cut branch-0.8 ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? no, just change version --- clients/client-python/setup.py | 2 +- docs/index.md | 8 ++++---- docs/manage-relational-metadata-using-gravitino.md | 10 +++++----- docs/open-api/openapi.yaml | 2 +- gradle.properties | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/clients/client-python/setup.py b/clients/client-python/setup.py index 108ad0226f1..878e74a1d00 100644 --- a/clients/client-python/setup.py +++ b/clients/client-python/setup.py @@ -27,7 +27,7 @@ setup( name="apache-gravitino", description="Python lib/client for Apache Gravitino", - version="0.8.0.dev0", + version="0.9.0.dev0", long_description=long_description, long_description_content_type="text/markdown", author="Apache Software Foundation", diff --git a/docs/index.md b/docs/index.md index 401e6c1d0a9..4a9c43131d9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -61,8 +61,8 @@ REST API and the Java SDK. You can use either to manage metadata. See Also, you can find the complete REST API definition in [Gravitino Open API](./api/rest/gravitino-rest-api), -Java SDK definition in [Gravitino Java doc](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/java/index.html), -and Python SDK definition in [Gravitino Python doc](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/python/index.html). +Java SDK definition in [Gravitino Java doc](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/java/index.html), +and Python SDK definition in [Gravitino Python doc](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/python/index.html). Gravitino also provides a web UI to manage the metadata. Visit the web UI in the browser via `http://:8090`. See [Gravitino web UI](./webui.md) for details. 
@@ -178,8 +178,8 @@ Gravitino provides security configurations for Gravitino, including HTTPS, authe ### Programming guides * [Gravitino Open API](./api/rest/gravitino-rest-api): provides the complete Open API definition of Gravitino. -* [Gravitino Java doc](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/java/index.html): provides the Javadoc for the Gravitino API. -* [Gravitino Python doc](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/python/index.html): provides the Python doc for the Gravitino API. +* [Gravitino Java doc](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/java/index.html): provides the Javadoc for the Gravitino API. +* [Gravitino Python doc](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/python/index.html): provides the Python doc for the Gravitino API. ### Development guides diff --git a/docs/manage-relational-metadata-using-gravitino.md b/docs/manage-relational-metadata-using-gravitino.md index 352a8de2935..b3d28e95128 100644 --- a/docs/manage-relational-metadata-using-gravitino.md +++ b/docs/manage-relational-metadata-using-gravitino.md @@ -909,7 +909,7 @@ The following types that Gravitino supports: | Union | `Types.UnionType.of([type1, type2, ...])` | `{"type": "union", "types": [type JSON, ...]}` | Union type, indicates a union of types | | UUID | `Types.UUIDType.get()` | `uuid` | UUID type, indicates a universally unique identifier | -The related java doc is [here](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/rel/types/Type.html). +The related java doc is [here](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/rel/types/Type.html). 
##### External type @@ -1022,10 +1022,10 @@ In addition to the basic settings, Gravitino supports the following features: | Feature | Description | Java doc | |---------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------| -| Table partitioning | Equal to `PARTITION BY` in Apache Hive, It is a partitioning strategy that is used to split a table into parts based on partition keys. Some table engine may not support this feature | [Partition](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/dto/rel/partitioning/Partitioning.html) | -| Table distribution | Equal to `CLUSTERED BY` in Apache Hive, distribution a.k.a (Clustering) is a technique to split the data into more manageable files/parts, (By specifying the number of buckets to create). The value of the distribution column will be hashed by a user-defined number into buckets. | [Distribution](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/rel/expressions/distributions/Distribution.html) | -| Table sort ordering | Equal to `SORTED BY` in Apache Hive, sort ordering is a method to sort the data in specific ways such as by a column or a function, and then store table data. it will highly improve the query performance under certain scenarios. | [SortOrder](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/rel/expressions/sorts/SortOrder.html) | -| Table indexes | Equal to `KEY/INDEX` in MySQL , unique key enforces uniqueness of values in one or more columns within a table. 
It ensures that no two rows have identical values in specified columns, thereby facilitating data integrity and enabling efficient data retrieval and manipulation operations. | [Index](pathname:///docs/0.8.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/rel/indexes/Index.html) | +| Table partitioning | Equal to `PARTITION BY` in Apache Hive, It is a partitioning strategy that is used to split a table into parts based on partition keys. Some table engine may not support this feature | [Partition](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/dto/rel/partitioning/Partitioning.html) | +| Table distribution | Equal to `CLUSTERED BY` in Apache Hive, distribution a.k.a (Clustering) is a technique to split the data into more manageable files/parts, (By specifying the number of buckets to create). The value of the distribution column will be hashed by a user-defined number into buckets. | [Distribution](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/rel/expressions/distributions/Distribution.html) | +| Table sort ordering | Equal to `SORTED BY` in Apache Hive, sort ordering is a method to sort the data in specific ways such as by a column or a function, and then store table data. it will highly improve the query performance under certain scenarios. | [SortOrder](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/rel/expressions/sorts/SortOrder.html) | +| Table indexes | Equal to `KEY/INDEX` in MySQL , unique key enforces uniqueness of values in one or more columns within a table. It ensures that no two rows have identical values in specified columns, thereby facilitating data integrity and enabling efficient data retrieval and manipulation operations. 
| [Index](pathname:///docs/0.9.0-incubating-SNAPSHOT/api/java/org/apache/gravitino/rel/indexes/Index.html) | For more information, please see the related document on [partitioning, bucketing, sorting, and indexes](table-partitioning-bucketing-sort-order-indexes.md). diff --git a/docs/open-api/openapi.yaml b/docs/open-api/openapi.yaml index f39a90f55f5..4405f130135 100644 --- a/docs/open-api/openapi.yaml +++ b/docs/open-api/openapi.yaml @@ -22,7 +22,7 @@ info: license: name: Apache 2.0 url: https://www.apache.org/licenses/LICENSE-2.0.html - version: 0.8.0-incubating-SNAPSHOT + version: 0.9.0-incubating-SNAPSHOT description: | Defines the specification for the first version of the Gravitino REST API. diff --git a/gradle.properties b/gradle.properties index cc1b9393018..4049f73840b 100644 --- a/gradle.properties +++ b/gradle.properties @@ -23,7 +23,7 @@ org.gradle.caching=true org.gradle.jvmargs=-Xmx4g # version that is going to be updated automatically by releases -version = 0.8.0-incubating-SNAPSHOT +version = 0.9.0-incubating-SNAPSHOT # sonatype credentials SONATYPE_USER = admin From 2d0cda5c43215688e7106892ae3f9d5bbbefbe87 Mon Sep 17 00:00:00 2001 From: Justin Mclean Date: Mon, 13 Jan 2025 13:13:59 +1100 Subject: [PATCH 05/40] [#6194] Add python client license and notice file (#6195) ## What changes were proposed in this pull request? Add license and notice files. ### Why are the changes needed? As the release's content is different to that of Gravitino. Fix: #6194 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Tested locally. 
--- clients/client-python/LICENSE | 214 +++++++++++++++++++++ clients/client-python/NOTICE | 8 + clients/client-python/build.gradle.kts | 6 - clients/client-python/licenses/kylinpy.txt | 21 ++ 4 files changed, 243 insertions(+), 6 deletions(-) create mode 100644 clients/client-python/LICENSE create mode 100644 clients/client-python/NOTICE create mode 100644 clients/client-python/licenses/kylinpy.txt diff --git a/clients/client-python/LICENSE b/clients/client-python/LICENSE new file mode 100644 index 00000000000..42c856d10b1 --- /dev/null +++ b/clients/client-python/LICENSE @@ -0,0 +1,214 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + The Web UI also bundles various third-party components also under + different licenses, please see web/LICENSE for these. + + This product bundles various third-party components also under the + Apache Software License 2.0. + + This product bundles a third-party component under the + MIT License. + + Kyligence/kylinpy + ./client-python/gravitino/utils/http_client.py + diff --git a/clients/client-python/NOTICE b/clients/client-python/NOTICE new file mode 100644 index 00000000000..c1fde5e04e3 --- /dev/null +++ b/clients/client-python/NOTICE @@ -0,0 +1,8 @@ +Apache Gravitino (incubating) +Copyright 2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +The initial code for the Gravitino project was donated +to the ASF by Datastrato (https://datastrato.ai/) copyright 2023-2024. 
\ No newline at end of file diff --git a/clients/client-python/build.gradle.kts b/clients/client-python/build.gradle.kts index bebf536f6eb..af6cfcd2d9f 100644 --- a/clients/client-python/build.gradle.kts +++ b/clients/client-python/build.gradle.kts @@ -285,9 +285,6 @@ tasks { generatePypiProjectHomePage() delete("dist") copy { - from("${project.rootDir}/licenses") { into("licenses") } - from("${project.rootDir}/LICENSE.bin") { into("./") } - from("${project.rootDir}/NOTICE.bin") { into("./") } from("${project.rootDir}/DISCLAIMER_WIP.txt") { into("./") } into("${project.rootDir}/clients/client-python") rename { fileName -> @@ -301,9 +298,6 @@ tasks { doLast { delete("README.md") - delete("licenses") - delete("LICENSE") - delete("NOTICE") delete("DISCLAIMER_WIP.txt") } } diff --git a/clients/client-python/licenses/kylinpy.txt b/clients/client-python/licenses/kylinpy.txt new file mode 100644 index 00000000000..580127c7327 --- /dev/null +++ b/clients/client-python/licenses/kylinpy.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2016 Dhamu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file From 9815cc3690652a7df06d070bf0b978b89755997d Mon Sep 17 00:00:00 2001 From: Lord of Abyss <103809695+Abyss-lord@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:37:54 +0800 Subject: [PATCH 06/40] [#6069] fix(docs): Fix access-control.md (#6189) ### What changes were proposed in this pull request? Fix the incorrect documentation for revoking roles from a group ### Why are the changes needed? Fix: #6069 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? local test. --- docs/security/access-control.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/security/access-control.md b/docs/security/access-control.md index 7e996738cb6..681ec4752d5 100644 --- a/docs/security/access-control.md +++ b/docs/security/access-control.md @@ -817,7 +817,7 @@ curl -X PUT -H "Accept: application/vnd.gravitino.v1+json" \ ```java GravitinoClient client = ... -Group group = client.grantRolesToGroup(Lists.newList("role1"), "group1"); +Group group = client.revokeRolesFromGroup(Lists.newList("role1"), "group1"); ``` From d9ae375d211c64ae6318c32add11e476c901c5f7 Mon Sep 17 00:00:00 2001 From: Yuhui Date: Mon, 13 Jan 2025 11:15:02 +0800 Subject: [PATCH 07/40] [#5533] fix (trino-connector): Fix the exception of ArrayIndexOutOfBoundsException when execute COMMENT COLUMN command (#6182) ### What changes were proposed in this pull request? Fix the ArrayIndexOutOfBoundsException thrown when handling the error message of an IllegalArgumentException ### Why are the changes needed? Fix: #5533 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested?
IT --- .../integration/test/TrinoQueryIT.java | 24 +++++++++---------- .../integration/test/TrinoQueryRunner.java | 19 ++++++++------- .../testsets/jdbc-mysql/00002_alter_table.sql | 2 ++ .../testsets/jdbc-mysql/00002_alter_table.txt | 2 ++ .../trino/connector/GravitinoErrorCode.java | 6 +++++ .../catalog/CatalogConnectorMetadata.java | 3 +-- 6 files changed, 34 insertions(+), 22 deletions(-) diff --git a/trino-connector/integration-test/src/test/java/org/apache/gravitino/trino/connector/integration/test/TrinoQueryIT.java b/trino-connector/integration-test/src/test/java/org/apache/gravitino/trino/connector/integration/test/TrinoQueryIT.java index d9940de4573..64e49723a6e 100644 --- a/trino-connector/integration-test/src/test/java/org/apache/gravitino/trino/connector/integration/test/TrinoQueryIT.java +++ b/trino-connector/integration-test/src/test/java/org/apache/gravitino/trino/connector/integration/test/TrinoQueryIT.java @@ -55,15 +55,15 @@ public class TrinoQueryIT extends TrinoQueryITBase { private static final Logger LOG = LoggerFactory.getLogger(TrinoQueryIT.class); - static String testsetsDir = ""; - AtomicInteger passCount = new AtomicInteger(0); - AtomicInteger totalCount = new AtomicInteger(0); - static boolean exitOnFailed = true; + protected static String testsetsDir; + protected AtomicInteger passCount = new AtomicInteger(0); + protected AtomicInteger totalCount = new AtomicInteger(0); + protected static boolean exitOnFailed = true; // key: tester name, value: tester result - private static Map allTestStatus = new TreeMap<>(); + private static final Map allTestStatus = new TreeMap<>(); - private static int testParallelism = 2; + private static final int testParallelism = 2; static Map queryParams = new HashMap<>(); @@ -275,8 +275,8 @@ void executeSqlFileWithCheckResult( * actual result matches the query failed result. 3. The expected result is a regular expression, * and the actual result matches the regular expression. 
* - * @param expectResult - * @param result + * @param expectResult the expected result + * @param result the actual result * @return false if the expected result is empty or the actual result does not match the expected. * For {@literal } case, return true if the actual result is empty. For {@literal * } case, replace the placeholder with "^Query \\w+ failed.*: " and do match. @@ -338,7 +338,7 @@ static boolean match(String expectResult, String result) { @Test public void testSql() throws Exception { ExecutorService executor = Executors.newFixedThreadPool(testParallelism); - CompletionService completionService = new ExecutorCompletionService<>(executor); + CompletionService completionService = new ExecutorCompletionService<>(executor); String[] testSetNames = Arrays.stream(TrinoQueryITBase.listDirectory(testsetsDir)) @@ -357,7 +357,7 @@ public void testSql() throws Exception { public void testSql(String testSetDirName, String catalog, String testerPrefix) throws Exception { ExecutorService executor = Executors.newFixedThreadPool(testParallelism); - CompletionService completionService = new ExecutorCompletionService<>(executor); + CompletionService completionService = new ExecutorCompletionService<>(executor); totalCount.addAndGet(getTesterCount(testSetDirName, catalog, testerPrefix)); List> futures = @@ -369,7 +369,7 @@ public void testSql(String testSetDirName, String catalog, String testerPrefix) private void waitForCompleted( ExecutorService executor, - CompletionService completionService, + CompletionService completionService, List> allFutures) { for (int i = 0; i < allFutures.size(); i++) { try { @@ -405,7 +405,7 @@ public String generateTestStatus() { } public List> runOneTestset( - CompletionService completionService, + CompletionService completionService, String testSetDirName, String catalog, String testerFilter) diff --git a/trino-connector/integration-test/src/test/java/org/apache/gravitino/trino/connector/integration/test/TrinoQueryRunner.java 
b/trino-connector/integration-test/src/test/java/org/apache/gravitino/trino/connector/integration/test/TrinoQueryRunner.java index 0e794e45ab5..7c3001a731e 100644 --- a/trino-connector/integration-test/src/test/java/org/apache/gravitino/trino/connector/integration/test/TrinoQueryRunner.java +++ b/trino-connector/integration-test/src/test/java/org/apache/gravitino/trino/connector/integration/test/TrinoQueryRunner.java @@ -42,9 +42,9 @@ class TrinoQueryRunner { private static final Logger LOG = LoggerFactory.getLogger(TrinoQueryRunner.class); - private QueryRunner queryRunner; - private Terminal terminal; - private URI uri; + private final QueryRunner queryRunner; + private final Terminal terminal; + private final URI uri; TrinoQueryRunner(String trinoUri) throws Exception { this.uri = new URI(trinoUri); @@ -92,10 +92,11 @@ String runQuery(String query) { String runQueryOnce(String query) { Query queryResult = queryRunner.startQuery(query); StringOutputStream outputStream = new StringOutputStream(); + StringOutputStream errorStream = new StringOutputStream(); queryResult.renderOutput( this.terminal, new PrintStream(outputStream), - new PrintStream(outputStream), + new PrintStream(errorStream), CSV, Optional.of(""), false); @@ -109,17 +110,19 @@ String runQueryOnce(String query) { session = builder.build(); queryRunner.setSession(session); } - return outputStream.toString(); + + // Avoid the IDE capturing the error message as failure + String err_message = errorStream.toString().replace("\nCaused by:", "\n-Caused by:"); + String out_message = outputStream.toString(); + return err_message + out_message; } - boolean stop() { + void stop() { try { queryRunner.close(); terminal.close(); - return true; } catch (Exception e) { LOG.error("Failed to stop query runner", e); - return false; } } } diff --git a/trino-connector/integration-test/src/test/resources/trino-ci-testset/testsets/jdbc-mysql/00002_alter_table.sql 
b/trino-connector/integration-test/src/test/resources/trino-ci-testset/testsets/jdbc-mysql/00002_alter_table.sql index b3af09a6580..e8058cde4ef 100644 --- a/trino-connector/integration-test/src/test/resources/trino-ci-testset/testsets/jdbc-mysql/00002_alter_table.sql +++ b/trino-connector/integration-test/src/test/resources/trino-ci-testset/testsets/jdbc-mysql/00002_alter_table.sql @@ -37,6 +37,8 @@ show create table gt_mysql.gt_db1.tb01; alter table gt_mysql.gt_db1.tb01 add column address varchar(200) not null comment 'address of users'; show create table gt_mysql.gt_db1.tb01; +COMMENT ON COLUMN gt_mysql.gt_db1.tb01.city IS NULL; + drop table gt_mysql.gt_db1.tb01; drop schema gt_mysql.gt_db1; diff --git a/trino-connector/integration-test/src/test/resources/trino-ci-testset/testsets/jdbc-mysql/00002_alter_table.txt b/trino-connector/integration-test/src/test/resources/trino-ci-testset/testsets/jdbc-mysql/00002_alter_table.txt index 3aa3144935c..b3b5366b9a6 100644 --- a/trino-connector/integration-test/src/test/resources/trino-ci-testset/testsets/jdbc-mysql/00002_alter_table.txt +++ b/trino-connector/integration-test/src/test/resources/trino-ci-testset/testsets/jdbc-mysql/00002_alter_table.txt @@ -104,6 +104,8 @@ WITH ( engine = 'InnoDB' )" + "newComment" field is required and cannot be empty + DROP TABLE DROP SCHEMA diff --git a/trino-connector/trino-connector/src/main/java/org/apache/gravitino/trino/connector/GravitinoErrorCode.java b/trino-connector/trino-connector/src/main/java/org/apache/gravitino/trino/connector/GravitinoErrorCode.java index 5741e4427bd..e47675d4574 100644 --- a/trino-connector/trino-connector/src/main/java/org/apache/gravitino/trino/connector/GravitinoErrorCode.java +++ b/trino-connector/trino-connector/src/main/java/org/apache/gravitino/trino/connector/GravitinoErrorCode.java @@ -23,6 +23,7 @@ import io.trino.spi.ErrorCode; import io.trino.spi.ErrorCodeSupplier; import io.trino.spi.ErrorType; +import java.util.List; public enum 
GravitinoErrorCode implements ErrorCodeSupplier { GRAVITINO_UNSUPPORTED_TRINO_VERSION(0, EXTERNAL), @@ -64,4 +65,9 @@ public enum GravitinoErrorCode implements ErrorCodeSupplier { public ErrorCode toErrorCode() { return errorCode; } + + public static String toSimpleErrorMessage(Exception e) { + List lines = e.getMessage().lines().toList(); + return lines.size() > 1 ? lines.get(0) + lines.get(1) : lines.get(0); + } } diff --git a/trino-connector/trino-connector/src/main/java/org/apache/gravitino/trino/connector/catalog/CatalogConnectorMetadata.java b/trino-connector/trino-connector/src/main/java/org/apache/gravitino/trino/connector/catalog/CatalogConnectorMetadata.java index 759a4de0889..3bb61f977e5 100644 --- a/trino-connector/trino-connector/src/main/java/org/apache/gravitino/trino/connector/catalog/CatalogConnectorMetadata.java +++ b/trino-connector/trino-connector/src/main/java/org/apache/gravitino/trino/connector/catalog/CatalogConnectorMetadata.java @@ -190,8 +190,7 @@ private void applyAlter(SchemaTableName tableName, TableChange... change) { // TODO yuhui need improve get the error message. From IllegalArgumentException. // At present, the IllegalArgumentException cannot get the error information clearly from the // Gravitino server. - String message = - e.getMessage().lines().toList().get(0) + e.getMessage().lines().toList().get(1); + String message = GravitinoErrorCode.toSimpleErrorMessage(e); throw new TrinoException(GravitinoErrorCode.GRAVITINO_ILLEGAL_ARGUMENT, message, e); } } From 1fa31013a40b7a2d98988b29370e12a6b7abf94e Mon Sep 17 00:00:00 2001 From: Justin Mclean Date: Mon, 13 Jan 2025 15:12:34 +1100 Subject: [PATCH 08/40] [Minor] Update command usage and add usage tracker in Gravitino CLI (#6137) ### What changes were proposed in this pull request? Update command usage and add usage tracker ### Why are the changes needed? So everything is up to date and we can see many many people look up the CLI docs. 
Fix: # N/A ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Tested locally. --- docs/cli.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 0cc7dee4af9..0598a36e034 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -30,13 +30,17 @@ The general structure for running commands with the Gravitino CLI is `gcli entit usage: gcli [metalake|catalog|schema|model|table|column|user|group|tag|topic|fileset] [list|details|create|delete|update|set|remove|properties|revoke|grant] [options] Options usage: gcli - -a,--audit display audit information + -a,--audit display audit information + --alias model aliases + --all all operation for --enable --auto column value auto-increments (true/false) -c,--comment entity comment --columnfile CSV file describing columns -d,--distribution display distribution information --datatype column data type --default default column value + --disable disable entities + --enable enable entities -f,--force force operation -g,--group group name -h,--help command help information @@ -52,6 +56,7 @@ The general structure for running commands with the Gravitino CLI is `gcli entit -p,--properties property name/value pairs --partition display partition information --position position of column + --privilege privilege(s) -r,--role role name --rename new entity name -s,--server Gravitino server version @@ -59,6 +64,7 @@ The general structure for running commands with the Gravitino CLI is `gcli entit --sortorder display sortorder information -t,--tag tag name -u,--url Gravitino URL (default: http://localhost:8090) + --uri model version artifact -v,--version Gravitino client version -V,--value property value -x,--index display index information @@ -950,4 +956,6 @@ gcli --simple ```bash gcli --simple --login userName -``` \ No newline at end of file +``` + + \ No newline at end of file From b2b2338d52003eceaa2e8ee959b73baf5c32c72a Mon Sep 17 00:00:00 2001 From: Qiming 
Teng Date: Mon, 13 Jan 2025 13:54:20 +0800 Subject: [PATCH 09/40] [doc] Revise the glossary documentation (#5837) ### What changes were proposed in this pull request? This PR fixes the glossary docs. ### Why are the changes needed? The glossary is reordered for quick reference. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? N/A --- docs/glossary.md | 384 ++++++++++++++++++++++++++++------------------- 1 file changed, 226 insertions(+), 158 deletions(-) diff --git a/docs/glossary.md b/docs/glossary.md index 83e97d915aa..3b42a6c7734 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -4,41 +4,180 @@ date: 2023-11-28 license: "This software is licensed under the Apache License version 2." --- +## API + +- Application Programming Interface, defining the methods and protocols for interacting with a server. + +## AWS + +- Amazon Web Services, a cloud computing platform provided by Amazon. + +## AWS Glue + +- A compatible implementation of the Hive Metastore Service (HMS). + +## GPG/GnuPG + +- Gnu Privacy Guard or GnuPG is an open-source implementation of the OpenPGP standard. + It is usually used for encrypting and signing files and emails. + +## HDFS + +- **HDFS** (Hadoop Distributed File System) is an open-source distributed file system. + It is a key component of the Apache Hadoop ecosystem. + HDFS is designed as a distributed storage solution to store and process large-scale datasets. + It features high reliability, fault tolerance, and excellent performance. + +## HTTP port + +- The port number on which a server listens for incoming connections. + +## IP address + +- Internet Protocol address, a numerical label assigned to each device in a computer network. + +## JDBC + +- Java Database Connectivity, an API for connecting Java applications to relational databases. + +## JDBC URI + +- The JDBC connection address specified in the catalog configuration. 
+ It usually includes components such as the database type, host, port, and database name. + +## JDK + +- The software development kit for the Java programming language. + A JDK provides tools for compiling, debugging, and running Java applications. + +## JMX + +- Java Management Extensions provides tools for managing and monitoring Java applications. + +## JSON + +- JavaScript Object Notation, a lightweight data interchange format. + +## JSON Web Token + +- See [JWT](#jwt). + +## JVM + +- A virtual machine that enables a computer to run Java applications. + A JVM implements an abstract machine that is different from the underlying hardware. + +## JVM instrumentation + +- The process of adding monitoring and management capabilities to the [JVM](#jvm). + The purpose of instrumentation is mainly for the collection of performance metrics. + +## JVM metrics + +- Metrics related to the performance and behavior of the [Java Virtual Machine](#jvm). + Some valuable metrics are memory usage, garbage collection, and buffer pool metrics. + +## JWT + +- A compact, URL-safe representation for claims between two parties. + +## KEYS file + +- A file containing public keys used to sign previous releases, necessary for verifying signatures. + +## PGP signature + +- A digital signature generated using the Pretty Good Privacy (PGP) algorithm. + The signature is typically used to validate the authenticity of a file. + +## REST + +- A set of architectural principles for designing networked applications. + +## REST API + +- Representational State Transfer (REST) Application Programming Interface. + A set of rules and conventions for building and interacting with Web services using standard HTTP methods. + +## SHA256 checksum + +- A cryptographic hash function used to verify the integrity of files. + +## SHA256 checksum file + +- A file containing the SHA256 hash value of another file, used for verification purposes. 
+ +## SQL + +- A programming language used to manage and manipulate relational databases. + +## SSH + +- Secure Shell, a cryptographic network protocol used for secure communication over a computer network. + +## URI + +- Uniform Resource Identifier, a string that identifies the name or resource on the internet. + +## YAML + +- YAML Ain't Markup Language, a human-readable file format often used for structured data. + +## Amazon Elastic Block Store (EBS) + +- A scalable block storage service provided by Amazon Web Services (AWS). + +## Apache Gravitino + +- An open-source software platform initially created by Datastrato. + It is designed for high-performance, geo-distributed, and federated metadata lakes. + Gravitino can manage metadata directly in different sources, types, and regions, + providing data and AI assets with unified metadata access. + +## Apache Gravitino configuration file (gravitino.conf) + +- The configuration file for the Gravitino server, located in the `conf` directory. + It follows the standard properties file format and contains settings for the Gravitino server. + ## Apache Hadoop - An open-source distributed storage and processing framework. ## Apache Hive -- An open-source data warehousing and SQL-like query language software project for managing and querying large datasets. +- An open-source data warehousing software project. + It provides SQL-like query language for managing and querying large datasets. ## Apache Iceberg - An open-source, versioned table format for large-scale data processing. -## Apache License version 2 +## Apache Iceberg Hive catalog -- A permissive, open-source software license written by The Apache Software Foundation. +- The **Iceberg Hive catalog** is a metadata service designed for the Apache Iceberg table format. + It allows external systems to interact with an Iceberg metadata using a Hive metastore thrift client. 
-## API +## Apache Iceberg JDBC catalog -- Application Programming Interface, defining the methods and protocols for interacting with a server. +- The **Iceberg JDBC catalog** is a metadata service designed for the Apache Iceberg table format. + It enables external systems to interact with an Iceberg metadata service using [JDBC](#jdbc). -## Authentication mechanism +## Apache Iceberg REST catalog -- The method used to verify the identity of users and clients accessing a server. +- The **Iceberg REST Catalog** is a metadata service designed for the Apache Iceberg table format. + It enables external systems to interact with Iceberg metadata service using a [REST API](#rest-api). -## AWS +## Apache License version 2 -- Amazon Web Services, a cloud computing platform provided by Amazon. +- A permissive, open-source software license written by The Apache Software Foundation. -## AWS Glue +## Authentication mechanism -- A compatible implementation of the Hive Metastore Service (HMS). +- The method used to verify the identity of users and clients accessing a server. ## Binary distribution package -- A package containing the compiled and executable version of the software, ready for distribution and deployment. +- A software package containing the compiled executables for distribution and deployment. ## Catalog @@ -50,15 +189,12 @@ license: "This software is licensed under the Apache License version 2." ## Columns -- The individual fields or attributes of a table, specifying details such as name, data type, comment, and nullability. +- The individual fields or attributes of a table. + Each column has properties like name, data type, comment, and nullability. ## Continuous integration (CI) -- The practice of automatically building, testing, and validating code changes when they are committed to version control. - -## Contributor covenant - -- A widely-used and recognized code of conduct for open-source communities. 
It provides guidelines for creating a welcoming and inclusive environment for all contributors. +- The practice of automatically building and testing code changes when they are committed to version control. ## Dependencies @@ -74,51 +210,56 @@ license: "This software is licensed under the Apache License version 2." ## Docker container -- A lightweight, standalone, executable package that includes everything needed to run a piece of software, including the code, runtime, libraries, and system tools. +- A lightweight, standalone package that includes everything needed to run the software. + A container compiles an application with its dependencies and runtime for distribution. ## Docker Hub -- A cloud-based registry service for Docker containers, allowing users to share and distribute containerized applications. +- A cloud-based registry service for Docker containers. + Users can publish, browse and download containerized software using this service. ## Docker image -- A lightweight, standalone, and executable package that includes everything needed to run a piece of software, including the code, runtime, libraries, and system tools. +- A lightweight, standalone package that includes everything needed to run the software. + A Docker image typically comprises the code, runtime, libraries, and system tools. -## Docker file +## Dockerfile -- A configuration file used to create a Docker image, specifying the base image, dependencies, and commands for building the image. +- A configuration file for building a Docker image. + A Dockerfile contains instructions to build a standard image for distributing the software. -## Dropwizard Metrics +## Dropwizard metrics - A Java library for measuring the performance of applications and providing support for various metric types. -## Amazon Elastic Block Store (EBS) - -- A scalable block storage service provided by Amazon Web Services. - ## Environment variables -- Variables used to pass information to running processes. 
+- Variables used to customize the runtime configuration for a process. ## Geo-distributed - The distribution of data or services across multiple geographic locations. +## Git + +- A distributed version control system used for tracking software artifacts. + ## GitHub -- A web-based platform for version control and collaboration using Git. +- A web-based platform for version control and community collaboration using Git. ## GitHub Actions -- A continuous integration and continuous deployment (CI/CD) service provided by GitHub, used for automating build, test, and deployment workflows. +- A continuous integration and continuous deployment (CI/CD) service provided by GitHub. + GitHub Actions automate the build, test, and deployment workflows. ## GitHub labels -- Tags assigned to GitHub issues or pull requests for organization, categorization, or workflow automation. +- Labels assigned to GitHub issues or pull requests for organization or workflow automation. ## GitHub pull request -- A proposed change to a repository submitted by a user through the GitHub platform. +- A proposed change to a GitHub repository submitted by a user. ## GitHub repository @@ -126,127 +267,67 @@ license: "This software is licensed under the Apache License version 2." ## GitHub workflow -- A series of automated steps defined in a YAML file that runs in response to events on a GitHub repository. - -## Git - -- A version control system used for tracking changes and collaborating on source code. - -## GPG/GnuPG - -- Gnu Privacy Guard or GnuPG, an open-source implementation of the OpenPGP standard, used for encrypting and signing files and emails. +- A series of automated steps triggered by specific events on a GitHub repository. ## Gradle -- A build automation tool for building, testing, and deploying projects. +- An automation tool for building, testing, and deploying projects. ## Gradlew -- A Gradle wrapper script, used for executing Gradle commands without installing Gradle separately. 
- -## Apache Gravitino - -- An open-source software platform originally created by Datastrato for high-performance, geo-distributed, and federated metadata lakes. Designed to manage metadata directly in different sources, types, and regions, providing unified metadata access for data and AI assets. - -## Apache Gravitino configuration file (gravitino.conf) - -- The configuration file for the Gravitino server, located in the `conf` directory. It follows the standard property file format and contains settings for the Gravitino server. +- A Gradle wrapper script used to execute Gradle commands. ## Hashes -- Cryptographic hash values generated from the contents of a file, often used for integrity verification. - -## HDFS - -- **HDFS** (Hadoop Distributed File System) is an open-source, distributed file system and a key component of the Apache Hadoop ecosystem. It is designed to store and process large-scale datasets, providing high reliability, fault tolerance, and performance for distributed storage solutions. +- Cryptographic hash values generated from some data. + A typical use case is to verify the integrity of a file. ## Headless -- A system without a graphical user interface. - -## HTTP port - -- The port number on which a server listens for incoming connections. - -## Apache Iceberg Hive catalog - -- The **Iceberg Hive catalog** is a specialized metadata service designed for the Apache Iceberg table format, allowing external systems to interact with Iceberg metadata via a Hive metastore thrift client. - -## Apache Iceberg REST catalog - -- The **Iceberg REST Catalog** is a specialized metadata service designed for the Apache Iceberg table format, allowing external systems to interact with Iceberg metadata via a RESTful API. 
- -## Apache Iceberg JDBC catalog - -- The **Iceberg JDBC Catalog** is a specialized metadata service designed for the Apache Iceberg table format, allowing external systems to interact with Iceberg metadata using JDBC (Java Database Connectivity). +- A system without a local console. ## Identity fields -- Fields in tables that define the identity of the table, specifying how rows in the table are uniquely identified. +- Fields in tables that define the identity of the records. + In the scope of a table, the identity fields are used as the unique identifier of a row. ## Integration tests -- Tests designed to ensure the correctness and compatibility of software when integrated into a unified system. - -## IP address - -- Internet Protocol address, a numerical label assigned to each device participating in a computer network. +- Tests that ensure software correctness and compatibility when integrating components into a larger system. ## Java Database Connectivity (JDBC) -- Java Database Connectivity, an API for connecting Java applications to relational databases. +- See [JDBC](#jdbc) ## Java Development Kits (JDKs) -- Software development kits for the Java programming language, including tools for compiling, debugging, and running Java applications. - -## Java Toolchain +- See [JDK](#jdk) -- A feature introduced in Gradle to detect and manage JDK versions. +## Java Management Extensions -## JDBC URI - -- The JDBC connection address specified in the catalog configuration, including details such as the database type, host, port, and database name. - -## JMX - -- Java Management Extensions provides tools for managing and monitoring Java applications. - -## JSON - -- JavaScript Object Notation, a lightweight data interchange format. +- See [JMX](#jmx) -## JWT(JSON Web Token) - -- A compact, URL-safe means of representing claims between two parties. 
- -## Java Virtual Machine (JVM) - -- A virtual machine that enables a computer to run Java applications, providing an abstraction layer between the application and the underlying hardware. - -## JVM metrics +## Java Toolchain -- Metrics related to the performance and behavior of the Java Virtual Machine (JVM), including memory usage, garbage collection, and buffer pool metrics. +- A Gradle feature for detecting and managing JDK versions. -## JVM instrumentation +## Java Virtual Machine -- The process of adding monitoring and management capabilities to the Java Virtual Machine, allowing for the collection of performance metrics. +- See [JVM](#jvm) ## Key pair - A pair of cryptographic keys, including a public key used for verification and a private key used for signing. -## KEYS file - -- A file containing public keys used to sign previous releases, necessary for verifying signatures. - ## Lakehouse -- **Lakehouse** refers to a modern data management architecture that combines elements of data lakes and data warehouses. It aims to provide a unified platform for storing, managing, and analyzing both raw unstructured data (similar to data lakes) and curated structured data. +- **Lakehouse** is a modern data management architecture that combines elements of data lakes and data warehouses. + It aims to provide a unified platform for storing, managing, and analyzing both raw unstructured data + (similar to data lakes) and curated structured data. ## Manifest -- A list of files and associated metadata that collectively define the structure and content of a release or distribution. +- A list of files and their associated metadata that collectively define the structure and content of a release or distribution. ## Merge operation @@ -254,7 +335,9 @@ license: "This software is licensed under the Apache License version 2." ## Metalake -- The top-level container for metadata. Typically, a metalake is a tenant-like mapping to an organization or a company. 
All the catalogs, users, and roles are under one metalake. +- The top-level container for metadata. + Typically, a metalake is a tenant-like mapping to an organization or a company. + All the catalogs, users, and roles are associated with one metalake. ## Metastore @@ -264,17 +347,14 @@ license: "This software is licensed under the Apache License version 2." - A distinct and separable part of a project. -## OrbStack - -- A tool mentioned as an alternative to Docker for macOS when running Gravitino integration tests. - ## Open authorization / OAuth -- A standard protocol for authorization that allows third-party applications to access user data without exposing user credentials. +- A standard protocol for authorization that allows third-party applications to authenticate a user. + The application doesn't need to access the user credentials. -## PGP Signature +## OrbStack -- A digital signature generated using the Pretty Good Privacy (PGP) algorithm, confirming the authenticity of a file. +- A tool mentioned as an alternative to Docker for macOS when running Gravitino integration tests. ## Private key @@ -282,31 +362,33 @@ license: "This software is licensed under the Apache License version 2." ## Properties -- Configurable settings and attributes associated with catalogs, schemas, and tables, to influence their behavior and storage. +- Configurable settings and attributes associated with catalogs, schemas, and tables. + The property settings influence the behavior and storage of the corresponding entities. ## Protocol buffers (protobuf) -- A method developed by Google for serializing structured data, similar to XML or JSON. It is often used for efficient and extensible communication between systems. +- A method developed by Google for serializing structured data, similar to XML or JSON. + It is often used for efficient and extensible communication between systems. 
## Public key - An openly shared key used for verification, encryption, or other operations intended for public knowledge. -## Representational State Transfer (REST) +## Representational State Transfer -- A set of architectural principles for designing networked applications. +- See [REST](#rest) -## REST API (Representational State Transfer Application Programming Interface) +## RocksDB -- A set of rules and conventions for building and interacting with web services using standard HTTP methods. +- An open source key-value storage database. ## Schema - A logical container for organizing tables in a database. -## Secure Shell (SSH) +## Secure Shell -- Secure Shell, a cryptographic network protocol used for secure communication over a computer network. +- See [SSH](#ssh) ## Security group @@ -314,15 +396,8 @@ license: "This software is licensed under the Apache License version 2." ## Serde -- A Serialization/Deserialization library responsible for transforming data between a tabular format and a format suitable for storage or transmission. - -## SHA256 checksum - -- A cryptographic hash function used to verify the integrity of files. - -## SHA256 checksum file - -- A file containing the SHA256 hash value of another file, used for verification purposes. +- A serialization/deserialization library. + It can transform data between a tabular format and a format suitable for storage or transmission. ## Snapshot @@ -336,21 +411,22 @@ license: "This software is licensed under the Apache License version 2." - A tool or process used to enforce code formatting standards and apply automatic formatting to code. -## Structured Query Language (SQL) +## Structured Query Language -- A programming language used to manage and manipulate relational databases. +- See [SQL](#sql) ## Table - A structured set of data elements stored in columns and rows. -## Token +## Thrift -- A **token** in the context of computing and security commonly refers to a small, indivisible unit of data. 
Tokens play a crucial role in various domains, including authentication, authorization, and cryptographic systems. +- A network protocol used for communication with Hive Metastore Service (HMS). -## Thrift protocol +## Token -- The network protocol used for communication with Hive Metastore Service (HMS). +- A **token** in the context of computing and security is a small, indivisible unit of data. + Tokens play a crucial role in various domains, including authentication and authorization. ## Trino @@ -360,30 +436,22 @@ license: "This software is licensed under the Apache License version 2." - A connector module for integrating Gravitino with Trino. -## Trino Apache Gravitino connector documentation - -- Documentation providing information on using the Trino connector to access metadata in Gravitino. - ## Ubuntu - A Linux distribution based on Debian, widely used for cloud computing and servers. ## Unit test -- A type of testing where individual components or functions of a program are tested to ensure they work as expected in isolation. - -## URI - -- Uniform Resource Identifier, a string that identifies the name or resource on the internet. +- A type of software testing where individual components or functions of a program are tested. + Unit tests help to ensure that the component or function works as expected in isolation. ## Verification -- The process of confirming the authenticity and integrity of a release by checking its signature and associated hashes. +- The process of confirming the authenticity and integrity of a release. + This is usually done by checking its signature and associated hash values. -## WEB UI +## Web UI - A graphical interface accessible through a web browser. -## YAML -- YAML Ain't Markup Language, a human-readable data serialization format often used for configuration files. 
From 1e97f475970b7a446b16bea7c0c453dfff8a504d Mon Sep 17 00:00:00 2001 From: roryqi Date: Mon, 13 Jan 2025 14:50:10 +0800 Subject: [PATCH 10/40] [#6200] improvement(docs): Add Docker image details for 0.8.0 (#6202) ### What changes were proposed in this pull request? Add Docker image details for 0.8.0 ### Why are the changes needed? Fix: #6200 ### Does this PR introduce _any_ user-facing change? Add doc. ### How was this patch tested? No need. --- docs/docker-image-details.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/docker-image-details.md b/docs/docker-image-details.md index 48b3bd191a1..a137923a694 100644 --- a/docs/docker-image-details.md +++ b/docs/docker-image-details.md @@ -19,6 +19,10 @@ docker run --rm -d -p 8090:8090 -p 9001:9001 apache/gravitino:0.7.0-incubating Changelog + +- apache/gravitino:0.8.0-incubating + - Based on Gravitino 0.8.0-incubating, you can know more information from 0.8.0-incubating [release notes](https://github.com/apache/gravitino/releases/tag/v0.8.0-incubating). + - apache/gravitino:0.7.0-incubating - Based on Gravitino 0.7.0-incubating, you can know more information from 0.7.0-incubating [release notes](https://github.com/apache/gravitino/releases/tag/v0.7.0-incubating). - Place bundle jars (gravitino-aws-bundle.jar, gravitino-gcp-bundle.jar, gravitino-aliyun-bundle.jar) in the `${GRAVITINO_HOME}/catalogs/hadoop/libs` folder to support the cloud storage catalog without manually adding the jars to the classpath. @@ -62,6 +66,12 @@ Changelog - apache/gravitino-iceberg-rest:0.8.0-incubating - Supports OSS and ADLS storage. + +- apache/gravitino-iceberg-rest:0.8.0-incubating + - Supports OSS and ADLS storage. + - Supports event listener. + - Supports audit log. + - apache/gravitino-iceberg-rest:0.7.0-incubating - Using JDBC catalog backend. - Supports S3 and GCS storage. 
@@ -100,10 +110,14 @@ Changelog ### Trino image Changelog + + +- apache/gravitino-playground:trino-435-gravitino-0.8.0-incubating + - Use Gravitino release 0.8.0-incubating Dockerfile to build the image. + - apache/gravitino-playground:trino-435-gravitino-0.7.0-incubating - Use Gravitino release 0.7.0-incubating Dockerfile to build the image. -Changelog - apache/gravitino-playground:trino-435-gravitino-0.6.1-incubating - Use Gravitino release 0.6.1-incubating Dockerfile to build the image. From d32af61bc56d902ce066cf96dde0449923a6aea5 Mon Sep 17 00:00:00 2001 From: Qi Yu Date: Mon, 13 Jan 2025 14:58:57 +0800 Subject: [PATCH 11/40] [#5545] fix(doris-catalog): Fix the problem that we can't set Doris table properties. (#6186) ### What changes were proposed in this pull request? Modify the table-properties SQL generated for the ALTER TABLE statement so that table properties can be set. ### Why are the changes needed? It's a bug: setting Doris table properties did not work. Fix: #5545 ### Does this PR introduce _any_ user-facing change? N/A. ### How was this patch tested? Integration test (IT).
--- .../doris/operation/DorisTableOperations.java | 17 ++++++++--------- .../doris/integration/test/CatalogDorisIT.java | 10 ++++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/catalogs/catalog-jdbc-doris/src/main/java/org/apache/gravitino/catalog/doris/operation/DorisTableOperations.java b/catalogs/catalog-jdbc-doris/src/main/java/org/apache/gravitino/catalog/doris/operation/DorisTableOperations.java index aa6348e2f71..829088f0131 100644 --- a/catalogs/catalog-jdbc-doris/src/main/java/org/apache/gravitino/catalog/doris/operation/DorisTableOperations.java +++ b/catalogs/catalog-jdbc-doris/src/main/java/org/apache/gravitino/catalog/doris/operation/DorisTableOperations.java @@ -567,10 +567,6 @@ protected String generateAlterTableSql( alterSql.add("MODIFY COMMENT \"" + newComment + "\""); } - if (!setProperties.isEmpty()) { - alterSql.add(generateTableProperties(setProperties)); - } - if (CollectionUtils.isEmpty(alterSql)) { return ""; } @@ -602,11 +598,14 @@ private String updateColumnNullabilityDefinition( } private String generateTableProperties(List setProperties) { - return setProperties.stream() - .map( - setProperty -> - String.format("\"%s\" = \"%s\"", setProperty.getProperty(), setProperty.getValue())) - .collect(Collectors.joining(",\n")); + String properties = + setProperties.stream() + .map( + setProperty -> + String.format( + "\"%s\" = \"%s\"", setProperty.getProperty(), setProperty.getValue())) + .collect(Collectors.joining(",\n")); + return "set (" + properties + ")"; } private String updateColumnCommentFieldDefinition( diff --git a/catalogs/catalog-jdbc-doris/src/test/java/org/apache/gravitino/catalog/doris/integration/test/CatalogDorisIT.java b/catalogs/catalog-jdbc-doris/src/test/java/org/apache/gravitino/catalog/doris/integration/test/CatalogDorisIT.java index 9288c9616bc..9d2c798ae7e 100644 --- a/catalogs/catalog-jdbc-doris/src/test/java/org/apache/gravitino/catalog/doris/integration/test/CatalogDorisIT.java +++ 
b/catalogs/catalog-jdbc-doris/src/test/java/org/apache/gravitino/catalog/doris/integration/test/CatalogDorisIT.java @@ -577,6 +577,16 @@ void testAlterDorisTable() { .pollInterval(WAIT_INTERVAL_IN_SECONDS, TimeUnit.SECONDS) .untilAsserted( () -> assertEquals(4, tableCatalog.loadTable(tableIdentifier).columns().length)); + + // set property + tableCatalog.alterTable(tableIdentifier, TableChange.setProperty("in_memory", "true")); + Awaitility.await() + .atMost(MAX_WAIT_IN_SECONDS, TimeUnit.SECONDS) + .pollInterval(WAIT_INTERVAL_IN_SECONDS, TimeUnit.SECONDS) + .untilAsserted( + () -> + assertEquals( + "true", tableCatalog.loadTable(tableIdentifier).properties().get("in_memory"))); } @Test From 7de40b88c6aa0124edc42eb44c460bd487782272 Mon Sep 17 00:00:00 2001 From: mchades Date: Mon, 13 Jan 2025 16:26:19 +0800 Subject: [PATCH 12/40] [#5721] improvement(mysql-catalog): add column not null limitation in unique index (#6183) ### What changes were proposed in this pull request? Add a NOT NULL requirement for the columns used in a unique index. ### Why are the changes needed? MySQL automatically changes a nullable column in a unique index to NOT NULL, so we enforce this restriction when the table is created. Fix: #5721 ### Does this PR introduce _any_ user-facing change? Yes, the restriction on MySQL unique indexes is now stricter. ### How was this patch tested?
tests added --- .../mysql/operation/MysqlTableOperations.java | 30 +++++++++++++++++++ .../integration/test/CatalogMysqlIT.java | 21 +++++++++++++ .../operation/TestMysqlTableOperations.java | 4 +-- 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/catalogs/catalog-jdbc-mysql/src/main/java/org/apache/gravitino/catalog/mysql/operation/MysqlTableOperations.java b/catalogs/catalog-jdbc-mysql/src/main/java/org/apache/gravitino/catalog/mysql/operation/MysqlTableOperations.java index b8cc2f87233..36b4daebf9b 100644 --- a/catalogs/catalog-jdbc-mysql/src/main/java/org/apache/gravitino/catalog/mysql/operation/MysqlTableOperations.java +++ b/catalogs/catalog-jdbc-mysql/src/main/java/org/apache/gravitino/catalog/mysql/operation/MysqlTableOperations.java @@ -106,6 +106,7 @@ protected String generateCreateTableSql( } } + validateIndexes(indexes, columns); appendIndexesSql(indexes, sqlBuilder); sqlBuilder.append("\n)"); @@ -642,4 +643,33 @@ private StringBuilder appendColumnDefinition(JdbcColumn column, StringBuilder sq private static String quote(String name) { return BACK_QUOTE + name + BACK_QUOTE; } + + /** + * Verify the columns in the index. 
+ * + * @param indexes the table indexes to validate + * @param columns the JDBC columns of the table + */ + private static void validateIndexes(Index[] indexes, JdbcColumn[] columns) { + Map columnMap = + Arrays.stream(columns).collect(Collectors.toMap(JdbcColumn::name, c -> c)); + for (Index index : indexes) { + if (index.type() == Index.IndexType.UNIQUE_KEY) { + // the column in the unique index must be not null + for (String[] colNames : index.fieldNames()) { + JdbcColumn column = columnMap.get(colNames[0]); + Preconditions.checkArgument( + column != null, + "Column %s in the unique index %s does not exist in the table", + colNames[0], + index.name()); + Preconditions.checkArgument( + !column.nullable(), + "Column %s in the unique index %s must be a not null column", + colNames[0], + index.name()); + } + } + } + } } diff --git a/catalogs/catalog-jdbc-mysql/src/test/java/org/apache/gravitino/catalog/mysql/integration/test/CatalogMysqlIT.java b/catalogs/catalog-jdbc-mysql/src/test/java/org/apache/gravitino/catalog/mysql/integration/test/CatalogMysqlIT.java index a80da4795a0..9bd949b7b31 100644 --- a/catalogs/catalog-jdbc-mysql/src/test/java/org/apache/gravitino/catalog/mysql/integration/test/CatalogMysqlIT.java +++ b/catalogs/catalog-jdbc-mysql/src/test/java/org/apache/gravitino/catalog/mysql/integration/test/CatalogMysqlIT.java @@ -1037,6 +1037,27 @@ void testCreateTableIndex() { Assertions.assertEquals(2, table.index().length); Assertions.assertNotNull(table.index()[0].name()); Assertions.assertNotNull(table.index()[1].name()); + + Column notNullCol = Column.of("col_6", Types.LongType.get(), "id", true, false, null); + Exception exception = + assertThrows( + IllegalArgumentException.class, + () -> + tableCatalog.createTable( + tableIdent, + new Column[] {notNullCol}, + table_comment, + properties, + Transforms.EMPTY_TRANSFORM, + Distributions.NONE, + new SortOrder[0], + new Index[] { + Indexes.of(Index.IndexType.UNIQUE_KEY, null, new String[][] {{"col_6"}}), + })); + Assertions.assertTrue( +
exception + .getMessage() + .contains("Column col_6 in the unique index null must be a not null column")); } @Test diff --git a/catalogs/catalog-jdbc-mysql/src/test/java/org/apache/gravitino/catalog/mysql/operation/TestMysqlTableOperations.java b/catalogs/catalog-jdbc-mysql/src/test/java/org/apache/gravitino/catalog/mysql/operation/TestMysqlTableOperations.java index 9eac348cd91..923e20fa0c0 100644 --- a/catalogs/catalog-jdbc-mysql/src/test/java/org/apache/gravitino/catalog/mysql/operation/TestMysqlTableOperations.java +++ b/catalogs/catalog-jdbc-mysql/src/test/java/org/apache/gravitino/catalog/mysql/operation/TestMysqlTableOperations.java @@ -64,7 +64,7 @@ public void testOperationTable() { .withName("col_1") .withType(VARCHAR) .withComment("test_comment") - .withNullable(true) + .withNullable(false) .build()); columns.add( JdbcColumn.builder() @@ -573,7 +573,7 @@ public void testCreateAndLoadTable() { JdbcColumn.builder() .withName("col_4") .withType(Types.DateType.get()) - .withNullable(true) + .withNullable(false) .withComment("date") .withDefaultValue(Column.DEFAULT_VALUE_NOT_SET) .build()); From 6138379dc316ef31e45c1594670f2e226c744d7a Mon Sep 17 00:00:00 2001 From: Qi Yu Date: Mon, 13 Jan 2025 18:49:05 +0800 Subject: [PATCH 13/40] [#5100] improvement(docs): Add extra documents to clarify the engine type of MySQL catalog (#6209) ### What changes were proposed in this pull request? Add more details about the usage of engine type for MySQL catalog. ### Why are the changes needed? The value of engine type may be influenced by many factors like MySQL version, configurations and so on, we need to clarify it. Fix: #5100 ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? 
N/A --- docs/jdbc-mysql-catalog.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/jdbc-mysql-catalog.md b/docs/jdbc-mysql-catalog.md index c761006a000..808e229a21d 100644 --- a/docs/jdbc-mysql-catalog.md +++ b/docs/jdbc-mysql-catalog.md @@ -186,6 +186,12 @@ Although MySQL itself does not support table properties, Gravitino offers table | `engine` | The engine used by the table. For example `MyISAM`, `MEMORY`, `CSV`, `ARCHIVE`, `BLACKHOLE`, `FEDERATED`, `ndbinfo`, `MRG_MYISAM`, `PERFORMANCE_SCHEMA`. | `InnoDB` | No | No | Yes | 0.4.0 | | `auto-increment-offset` | Used to specify the starting value of the auto-increment field. | (none) | No | No | Yes | 0.4.0 | + +:::note +Some MySQL storage engines, such as FEDERATED, are not enabled by default and require additional configuration to use. For example, to enable the FEDERATED engine, set federated=1 in the MySQL configuration file. Similarly, engines like ndbinfo, MRG_MYISAM, and PERFORMANCE_SCHEMA may also require specific prerequisites or configurations. For detailed instructions, +refer to the [MySQL documentation](https://dev.mysql.com/doc/refman/8.0/en/federated-storage-engine.html). +::: + ### Table indexes - Supports PRIMARY_KEY and UNIQUE_KEY. From e4151e92e32864c603cb371559e19efdcae6262e Mon Sep 17 00:00:00 2001 From: yangyang zhong <35210666+hdygxsj@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:56:19 +0800 Subject: [PATCH 14/40] [#5192] [#5193] feat(flink): Support Catalog&Schema Operation DDL for paimon-catalog (#5818) ### What changes were proposed in this pull request? Support Catalog Operation DDL for paimon-catalog ### Why are the changes needed? Fix #5192 #5193 ### Does this PR introduce _any_ user-facing change? None ### How was this patch tested? 
org.apache.gravitino.flink.connector.paimon.TestPaimonPropertiesConverter org.apache.gravitino.flink.connector.integration.test.paimon.FlinkPaimonCatalogIT --- .../paimon/PaimonPropertiesUtils.java | 46 +++++--- flink-connector/flink/build.gradle.kts | 5 +- .../paimon/GravitinoPaimonCatalog.java | 48 ++++++++ .../paimon/GravitinoPaimonCatalogFactory.java | 80 +++++++++++++ .../GravitinoPaimonCatalogFactoryOptions.java | 26 ++++ .../paimon/PaimonPropertiesConverter.java | 80 +++++++++++++ .../store/GravitinoCatalogStore.java | 3 +- .../org.apache.flink.table.factories.Factory | 3 +- .../integration/test/FlinkCommonIT.java | 54 ++++++++- .../test/paimon/FlinkPaimonCatalogIT.java | 111 ++++++++++++++++++ .../paimon/TestPaimonPropertiesConverter.java | 101 ++++++++++++++++ 11 files changed, 536 insertions(+), 21 deletions(-) create mode 100644 flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalog.java create mode 100644 flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalogFactory.java create mode 100644 flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalogFactoryOptions.java create mode 100644 flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/PaimonPropertiesConverter.java create mode 100644 flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/paimon/FlinkPaimonCatalogIT.java create mode 100644 flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/paimon/TestPaimonPropertiesConverter.java diff --git a/catalogs/catalog-common/src/main/java/org/apache/gravitino/catalog/lakehouse/paimon/PaimonPropertiesUtils.java b/catalogs/catalog-common/src/main/java/org/apache/gravitino/catalog/lakehouse/paimon/PaimonPropertiesUtils.java index 0dcf24f3a67..7b1832fe56d 100644 --- 
a/catalogs/catalog-common/src/main/java/org/apache/gravitino/catalog/lakehouse/paimon/PaimonPropertiesUtils.java +++ b/catalogs/catalog-common/src/main/java/org/apache/gravitino/catalog/lakehouse/paimon/PaimonPropertiesUtils.java @@ -32,25 +32,41 @@ public class PaimonPropertiesUtils { // will only need to set the configuration 'catalog-backend' in Gravitino and Gravitino will // change it to `catalogType` automatically and pass it to Paimon. public static final Map GRAVITINO_CONFIG_TO_PAIMON; + public static final Map PAIMON_CATALOG_CONFIG_TO_GRAVITINO; static { - Map map = new HashMap(); - map.put(PaimonConstants.CATALOG_BACKEND, PaimonConstants.CATALOG_BACKEND); - map.put(PaimonConstants.GRAVITINO_JDBC_DRIVER, PaimonConstants.GRAVITINO_JDBC_DRIVER); - map.put(PaimonConstants.GRAVITINO_JDBC_USER, PaimonConstants.PAIMON_JDBC_USER); - map.put(PaimonConstants.GRAVITINO_JDBC_PASSWORD, PaimonConstants.PAIMON_JDBC_PASSWORD); - map.put(PaimonConstants.URI, PaimonConstants.URI); - map.put(PaimonConstants.WAREHOUSE, PaimonConstants.WAREHOUSE); - map.put(PaimonConstants.CATALOG_BACKEND_NAME, PaimonConstants.CATALOG_BACKEND_NAME); + Map gravitinoConfigToPaimon = new HashMap<>(); + Map paimonCatalogConfigToGravitino = new HashMap<>(); + gravitinoConfigToPaimon.put(PaimonConstants.CATALOG_BACKEND, PaimonConstants.CATALOG_BACKEND); + gravitinoConfigToPaimon.put( + PaimonConstants.GRAVITINO_JDBC_DRIVER, PaimonConstants.GRAVITINO_JDBC_DRIVER); + gravitinoConfigToPaimon.put( + PaimonConstants.GRAVITINO_JDBC_USER, PaimonConstants.PAIMON_JDBC_USER); + gravitinoConfigToPaimon.put( + PaimonConstants.GRAVITINO_JDBC_PASSWORD, PaimonConstants.PAIMON_JDBC_PASSWORD); + gravitinoConfigToPaimon.put(PaimonConstants.URI, PaimonConstants.URI); + gravitinoConfigToPaimon.put(PaimonConstants.WAREHOUSE, PaimonConstants.WAREHOUSE); + gravitinoConfigToPaimon.put( + PaimonConstants.CATALOG_BACKEND_NAME, PaimonConstants.CATALOG_BACKEND_NAME); // S3 - map.put(S3Properties.GRAVITINO_S3_ENDPOINT, 
PaimonConstants.S3_ENDPOINT); - map.put(S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, PaimonConstants.S3_ACCESS_KEY); - map.put(S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, PaimonConstants.S3_SECRET_KEY); + gravitinoConfigToPaimon.put(S3Properties.GRAVITINO_S3_ENDPOINT, PaimonConstants.S3_ENDPOINT); + gravitinoConfigToPaimon.put( + S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, PaimonConstants.S3_ACCESS_KEY); + gravitinoConfigToPaimon.put( + S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, PaimonConstants.S3_SECRET_KEY); // OSS - map.put(OSSProperties.GRAVITINO_OSS_ENDPOINT, PaimonConstants.OSS_ENDPOINT); - map.put(OSSProperties.GRAVITINO_OSS_ACCESS_KEY_ID, PaimonConstants.OSS_ACCESS_KEY); - map.put(OSSProperties.GRAVITINO_OSS_ACCESS_KEY_SECRET, PaimonConstants.OSS_SECRET_KEY); - GRAVITINO_CONFIG_TO_PAIMON = Collections.unmodifiableMap(map); + gravitinoConfigToPaimon.put(OSSProperties.GRAVITINO_OSS_ENDPOINT, PaimonConstants.OSS_ENDPOINT); + gravitinoConfigToPaimon.put( + OSSProperties.GRAVITINO_OSS_ACCESS_KEY_ID, PaimonConstants.OSS_ACCESS_KEY); + gravitinoConfigToPaimon.put( + OSSProperties.GRAVITINO_OSS_ACCESS_KEY_SECRET, PaimonConstants.OSS_SECRET_KEY); + GRAVITINO_CONFIG_TO_PAIMON = Collections.unmodifiableMap(gravitinoConfigToPaimon); + gravitinoConfigToPaimon.forEach( + (key, value) -> { + paimonCatalogConfigToGravitino.put(value, key); + }); + PAIMON_CATALOG_CONFIG_TO_GRAVITINO = + Collections.unmodifiableMap(paimonCatalogConfigToGravitino); } /** diff --git a/flink-connector/flink/build.gradle.kts b/flink-connector/flink/build.gradle.kts index 9e2a48c036c..f137a3eae1b 100644 --- a/flink-connector/flink/build.gradle.kts +++ b/flink-connector/flink/build.gradle.kts @@ -26,6 +26,7 @@ repositories { mavenCentral() } +var paimonVersion: String = libs.versions.paimon.get() val flinkVersion: String = libs.versions.flink.get() val flinkMajorVersion: String = flinkVersion.substringBeforeLast(".") @@ -38,14 +39,15 @@ val scalaVersion: String = "2.12" val artifactName = 
"${rootProject.name}-flink-${flinkMajorVersion}_$scalaVersion" dependencies { + implementation(project(":core")) implementation(project(":catalogs:catalog-common")) implementation(libs.guava) compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) - compileOnly("org.apache.flink:flink-connector-hive_$scalaVersion:$flinkVersion") compileOnly("org.apache.flink:flink-table-common:$flinkVersion") compileOnly("org.apache.flink:flink-table-api-java:$flinkVersion") + compileOnly("org.apache.paimon:paimon-flink-1.18:$paimonVersion") compileOnly(libs.hive2.exec) { artifact { @@ -90,6 +92,7 @@ dependencies { testImplementation("org.apache.flink:flink-connector-hive_$scalaVersion:$flinkVersion") testImplementation("org.apache.flink:flink-table-common:$flinkVersion") testImplementation("org.apache.flink:flink-table-api-java:$flinkVersion") + testImplementation("org.apache.paimon:paimon-flink-$flinkMajorVersion:$paimonVersion") testImplementation(libs.hive2.exec) { artifact { diff --git a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalog.java b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalog.java new file mode 100644 index 00000000000..017ac6e7085 --- /dev/null +++ b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalog.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.flink.connector.paimon; + +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.gravitino.flink.connector.PartitionConverter; +import org.apache.gravitino.flink.connector.PropertiesConverter; +import org.apache.gravitino.flink.connector.catalog.BaseCatalog; + +/** + * The GravitinoPaimonCatalog class is an implementation of the BaseCatalog class that is used to + * proxy the PaimonCatalog class. + */ +public class GravitinoPaimonCatalog extends BaseCatalog { + + private final AbstractCatalog paimonCatalog; + + protected GravitinoPaimonCatalog( + String catalogName, + AbstractCatalog paimonCatalog, + PropertiesConverter propertiesConverter, + PartitionConverter partitionConverter) { + super(catalogName, paimonCatalog.getDefaultDatabase(), propertiesConverter, partitionConverter); + this.paimonCatalog = paimonCatalog; + } + + @Override + protected AbstractCatalog realCatalog() { + return paimonCatalog; + } +} diff --git a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalogFactory.java b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalogFactory.java new file mode 100644 index 00000000000..52489fc667f --- /dev/null +++ b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalogFactory.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.flink.connector.paimon; + +import java.util.Collections; +import java.util.Set; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.table.catalog.Catalog; +import org.apache.gravitino.flink.connector.DefaultPartitionConverter; +import org.apache.gravitino.flink.connector.PartitionConverter; +import org.apache.gravitino.flink.connector.PropertiesConverter; +import org.apache.gravitino.flink.connector.catalog.BaseCatalogFactory; +import org.apache.paimon.flink.FlinkCatalog; +import org.apache.paimon.flink.FlinkCatalogFactory; + +/** + * Factory for creating instances of {@link GravitinoPaimonCatalog}. It will be created by SPI + * discovery in Flink. 
+ */ +public class GravitinoPaimonCatalogFactory implements BaseCatalogFactory { + + @Override + public Catalog createCatalog(Context context) { + FlinkCatalog catalog = new FlinkCatalogFactory().createCatalog(context); + return new GravitinoPaimonCatalog( + context.getName(), catalog, propertiesConverter(), partitionConverter()); + } + + @Override + public String factoryIdentifier() { + return GravitinoPaimonCatalogFactoryOptions.IDENTIFIER; + } + + @Override + public Set> requiredOptions() { + return Collections.emptySet(); + } + + @Override + public Set> optionalOptions() { + return Collections.emptySet(); + } + + @Override + public String gravitinoCatalogProvider() { + return "lakehouse-paimon"; + } + + @Override + public org.apache.gravitino.Catalog.Type gravitinoCatalogType() { + return org.apache.gravitino.Catalog.Type.RELATIONAL; + } + + @Override + public PropertiesConverter propertiesConverter() { + return PaimonPropertiesConverter.INSTANCE; + } + + @Override + public PartitionConverter partitionConverter() { + return DefaultPartitionConverter.INSTANCE; + } +} diff --git a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalogFactoryOptions.java b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalogFactoryOptions.java new file mode 100644 index 00000000000..dd78f96d24b --- /dev/null +++ b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalogFactoryOptions.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.flink.connector.paimon; + +public class GravitinoPaimonCatalogFactoryOptions { + + /** Identifier for the {@link GravitinoPaimonCatalog}. */ + public static final String IDENTIFIER = "gravitino-paimon"; +} diff --git a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/PaimonPropertiesConverter.java b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/PaimonPropertiesConverter.java new file mode 100644 index 00000000000..58613bee37d --- /dev/null +++ b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/PaimonPropertiesConverter.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.gravitino.flink.connector.paimon; + +import com.google.common.collect.Maps; +import java.util.HashMap; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.catalog.CommonCatalogOptions; +import org.apache.gravitino.catalog.lakehouse.paimon.PaimonConstants; +import org.apache.gravitino.catalog.lakehouse.paimon.PaimonPropertiesUtils; +import org.apache.gravitino.flink.connector.PropertiesConverter; +import org.apache.paimon.catalog.FileSystemCatalogFactory; + +public class PaimonPropertiesConverter implements PropertiesConverter { + + public static final PaimonPropertiesConverter INSTANCE = new PaimonPropertiesConverter(); + + private PaimonPropertiesConverter() {} + + @Override + public Map toGravitinoCatalogProperties(Configuration flinkConf) { + Map gravitinoProperties = Maps.newHashMap(); + Map flinkConfMap = flinkConf.toMap(); + for (Map.Entry entry : flinkConfMap.entrySet()) { + String gravitinoKey = + PaimonPropertiesUtils.PAIMON_CATALOG_CONFIG_TO_GRAVITINO.get(entry.getKey()); + if (gravitinoKey != null) { + gravitinoProperties.put(gravitinoKey, entry.getValue()); + } else if (!entry.getKey().startsWith(FLINK_PROPERTY_PREFIX)) { + gravitinoProperties.put(FLINK_PROPERTY_PREFIX + entry.getKey(), entry.getValue()); + } else { + gravitinoProperties.put(entry.getKey(), entry.getValue()); + } + } + gravitinoProperties.put( + PaimonConstants.CATALOG_BACKEND, + flinkConfMap.getOrDefault(PaimonConstants.METASTORE, FileSystemCatalogFactory.IDENTIFIER)); + return gravitinoProperties; + } + + @Override + public Map toFlinkCatalogProperties(Map gravitinoProperties) { + Map all = new HashMap<>(); + gravitinoProperties.forEach( + (key, value) -> { + String flinkConfigKey = key; + if (key.startsWith(PropertiesConverter.FLINK_PROPERTY_PREFIX)) { + flinkConfigKey = key.substring(PropertiesConverter.FLINK_PROPERTY_PREFIX.length()); + } + all.put(flinkConfigKey, value); + }); + Map 
paimonCatalogProperties = + PaimonPropertiesUtils.toPaimonCatalogProperties(all); + paimonCatalogProperties.put( + PaimonConstants.METASTORE, + paimonCatalogProperties.getOrDefault( + PaimonConstants.CATALOG_BACKEND, FileSystemCatalogFactory.IDENTIFIER)); + paimonCatalogProperties.put( + CommonCatalogOptions.CATALOG_TYPE.key(), GravitinoPaimonCatalogFactoryOptions.IDENTIFIER); + return paimonCatalogProperties; + } +} diff --git a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/store/GravitinoCatalogStore.java b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/store/GravitinoCatalogStore.java index 92e778ce297..4c29b7fde3b 100644 --- a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/store/GravitinoCatalogStore.java +++ b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/store/GravitinoCatalogStore.java @@ -54,7 +54,8 @@ public GravitinoCatalogStore(GravitinoCatalogManager catalogManager) { public void storeCatalog(String catalogName, CatalogDescriptor descriptor) throws CatalogException { Configuration configuration = descriptor.getConfiguration(); - BaseCatalogFactory catalogFactory = getCatalogFactory(configuration.toMap()); + Map gravitino = configuration.toMap(); + BaseCatalogFactory catalogFactory = getCatalogFactory(gravitino); Map gravitinoProperties = catalogFactory.propertiesConverter().toGravitinoCatalogProperties(configuration); gravitinoCatalogManager.createCatalog( diff --git a/flink-connector/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink-connector/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory index c9d9c92b5ef..a535afb6dc2 100644 --- a/flink-connector/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory +++ b/flink-connector/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -18,4 +18,5 @@ # 
org.apache.gravitino.flink.connector.store.GravitinoCatalogStoreFactory -org.apache.gravitino.flink.connector.hive.GravitinoHiveCatalogFactory \ No newline at end of file +org.apache.gravitino.flink.connector.hive.GravitinoHiveCatalogFactory +org.apache.gravitino.flink.connector.paimon.GravitinoPaimonCatalogFactory \ No newline at end of file diff --git a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/FlinkCommonIT.java b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/FlinkCommonIT.java index 2d022b4a8a4..5a363e4e51b 100644 --- a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/FlinkCommonIT.java +++ b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/FlinkCommonIT.java @@ -27,6 +27,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Optional; @@ -53,11 +54,24 @@ import org.apache.gravitino.rel.types.Types; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIf; public abstract class FlinkCommonIT extends FlinkEnvIT { protected abstract Catalog currentCatalog(); + protected boolean supportTableOperation() { + return true; + } + + protected boolean supportColumnOperation() { + return true; + } + + protected boolean supportSchemaOperationWithCommentAndOptions() { + return true; + } + @Test public void testCreateSchema() { doWithCatalog( @@ -76,7 +90,29 @@ public void testCreateSchema() { } @Test - public void testGetSchema() { + public void testGetSchemaWithoutCommentAndOption() { + doWithCatalog( + currentCatalog(), + catalog -> { + String schema = "test_get_schema"; + try { + TestUtils.assertTableResult( + sql("CREATE DATABASE IF NOT EXISTS %s", schema), 
ResultKind.SUCCESS); + TestUtils.assertTableResult(tableEnv.executeSql("USE " + schema), ResultKind.SUCCESS); + + catalog.asSchemas().schemaExists(schema); + Schema loadedSchema = catalog.asSchemas().loadSchema(schema); + Assertions.assertEquals(schema, loadedSchema.name()); + } finally { + catalog.asSchemas().dropSchema(schema, true); + Assertions.assertFalse(catalog.asSchemas().schemaExists(schema)); + } + }); + } + + @Test + @EnabledIf("supportSchemaOperationWithCommentAndOptions") + public void testGetSchemaWithCommentAndOptions() { doWithCatalog( currentCatalog(), catalog -> { @@ -114,7 +150,6 @@ public void testListSchema() { doWithCatalog( currentCatalog(), catalog -> { - Assertions.assertEquals(1, catalog.asSchemas().listSchemas().length); String schema = "test_list_schema"; String schema2 = "test_list_schema2"; String schema3 = "test_list_schema3"; @@ -135,6 +170,7 @@ public void testListSchema() { Row.of(schema3)); String[] schemas = catalog.asSchemas().listSchemas(); + Arrays.sort(schemas); Assertions.assertEquals(4, schemas.length); Assertions.assertEquals("default", schemas[0]); Assertions.assertEquals(schema, schemas[1]); @@ -150,7 +186,8 @@ public void testListSchema() { } @Test - public void testAlterSchema() { + @EnabledIf("supportSchemaOperationWithCommentAndOptions") + public void testAlterSchemaWithCommentAndOptions() { doWithCatalog( currentCatalog(), catalog -> { @@ -188,6 +225,7 @@ public void testAlterSchema() { } @Test + @EnabledIf("supportTableOperation") public void testCreateSimpleTable() { String databaseName = "test_create_no_partition_table_db"; String tableName = "test_create_no_partition_table"; @@ -236,6 +274,7 @@ public void testCreateSimpleTable() { } @Test + @EnabledIf("supportTableOperation") public void testListTables() { String newSchema = "test_list_table_catalog"; Column[] columns = new Column[] {Column.of("user_id", Types.IntegerType.get(), "USER_ID")}; @@ -268,6 +307,7 @@ public void testListTables() { } @Test + 
@EnabledIf("supportTableOperation") public void testDropTable() { String databaseName = "test_drop_table_db"; doWithSchema( @@ -289,6 +329,7 @@ public void testDropTable() { } @Test + @EnabledIf("supportTableOperation") public void testGetSimpleTable() { String databaseName = "test_get_simple_table"; Column[] columns = @@ -342,6 +383,7 @@ public void testGetSimpleTable() { } @Test + @EnabledIf("supportColumnOperation") public void testRenameColumn() { String databaseName = "test_rename_column_db"; String tableName = "test_rename_column"; @@ -377,6 +419,7 @@ public void testRenameColumn() { } @Test + @EnabledIf("supportColumnOperation") public void testAlterTableComment() { String databaseName = "test_alter_table_comment_database"; String tableName = "test_alter_table_comment"; @@ -436,6 +479,7 @@ public void testAlterTableComment() { } @Test + @EnabledIf("supportColumnOperation") public void testAlterTableAddColumn() { String databaseName = "test_alter_table_add_column_db"; String tableName = "test_alter_table_add_column"; @@ -471,6 +515,7 @@ public void testAlterTableAddColumn() { } @Test + @EnabledIf("supportColumnOperation") public void testAlterTableDropColumn() { String databaseName = "test_alter_table_drop_column_db"; String tableName = "test_alter_table_drop_column"; @@ -501,6 +546,7 @@ public void testAlterTableDropColumn() { } @Test + @EnabledIf("supportColumnOperation") public void testAlterColumnTypeAndChangeOrder() { String databaseName = "test_alter_table_alter_column_db"; String tableName = "test_alter_table_rename_column"; @@ -542,6 +588,7 @@ public void testAlterColumnTypeAndChangeOrder() { } @Test + @EnabledIf("supportTableOperation") public void testRenameTable() { String databaseName = "test_rename_table_db"; String tableName = "test_rename_table"; @@ -569,6 +616,7 @@ public void testRenameTable() { } @Test + @EnabledIf("supportTableOperation") public void testAlterTableProperties() { String databaseName = "test_alter_table_properties_db"; String 
tableName = "test_alter_table_properties"; diff --git a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/paimon/FlinkPaimonCatalogIT.java b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/paimon/FlinkPaimonCatalogIT.java new file mode 100644 index 00000000000..10fab3567a3 --- /dev/null +++ b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/paimon/FlinkPaimonCatalogIT.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.gravitino.flink.connector.integration.test.paimon; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import java.nio.file.Path; +import java.util.Map; +import org.apache.gravitino.Catalog; +import org.apache.gravitino.catalog.lakehouse.paimon.PaimonConstants; +import org.apache.gravitino.flink.connector.integration.test.FlinkCommonIT; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +@Tag("gravitino-docker-test") +public class FlinkPaimonCatalogIT extends FlinkCommonIT { + + @TempDir private static Path warehouseDir; + + private static final String DEFAULT_PAIMON_CATALOG = + "test_flink_paimon_filesystem_schema_catalog"; + + private static org.apache.gravitino.Catalog catalog; + + @Override + protected boolean supportColumnOperation() { + return false; + } + + @Override + protected boolean supportTableOperation() { + return false; + } + + @Override + protected boolean supportSchemaOperationWithCommentAndOptions() { + return false; + } + + protected Catalog currentCatalog() { + return catalog; + } + + @BeforeAll + static void setup() { + initPaimonCatalog(); + } + + @AfterAll + static void stop() { + Preconditions.checkNotNull(metalake); + metalake.dropCatalog(DEFAULT_PAIMON_CATALOG, true); + } + + private static void initPaimonCatalog() { + Preconditions.checkNotNull(metalake); + catalog = + metalake.createCatalog( + DEFAULT_PAIMON_CATALOG, + org.apache.gravitino.Catalog.Type.RELATIONAL, + "lakehouse-paimon", + null, + ImmutableMap.of( + PaimonConstants.CATALOG_BACKEND, + "filesystem", + "warehouse", + warehouseDir.toString())); + } + + @Test + public void testCreateGravitinoPaimonCatalogUsingSQL() { + tableEnv.useCatalog(DEFAULT_CATALOG); + int numCatalogs = tableEnv.listCatalogs().length; + String 
catalogName = "gravitino_hive_sql"; + String warehouse = warehouseDir.toString(); + tableEnv.executeSql( + String.format( + "create catalog %s with (" + + "'type'='gravitino-paimon', " + + "'warehouse'='%s'," + + "'catalog.backend'='filesystem'" + + ")", + catalogName, warehouse)); + String[] catalogs = tableEnv.listCatalogs(); + Assertions.assertEquals(numCatalogs + 1, catalogs.length, "Should create a new catalog"); + Assertions.assertTrue(metalake.catalogExists(catalogName)); + org.apache.gravitino.Catalog gravitinoCatalog = metalake.loadCatalog(catalogName); + Map properties = gravitinoCatalog.properties(); + Assertions.assertEquals(warehouse, properties.get("warehouse")); + } +} diff --git a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/paimon/TestPaimonPropertiesConverter.java b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/paimon/TestPaimonPropertiesConverter.java new file mode 100644 index 00000000000..4496d94c0a4 --- /dev/null +++ b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/paimon/TestPaimonPropertiesConverter.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.gravitino.flink.connector.paimon; + +import com.google.common.collect.ImmutableMap; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.gravitino.catalog.lakehouse.paimon.PaimonConstants; +import org.apache.gravitino.flink.connector.PropertiesConverter; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** Test for {@link PaimonPropertiesConverter} */ +public class TestPaimonPropertiesConverter { + + private static final PaimonPropertiesConverter CONVERTER = PaimonPropertiesConverter.INSTANCE; + + private static final String localWarehouse = "file:///tmp/paimon_warehouse"; + + @Test + public void testToPaimonFileSystemCatalog() { + Map catalogProperties = ImmutableMap.of("warehouse", localWarehouse); + Map flinkCatalogProperties = + CONVERTER.toFlinkCatalogProperties(catalogProperties); + Assertions.assertEquals( + GravitinoPaimonCatalogFactoryOptions.IDENTIFIER, flinkCatalogProperties.get("type")); + Assertions.assertEquals(localWarehouse, flinkCatalogProperties.get("warehouse")); + } + + @Test + public void testToPaimonJdbcCatalog() { + String testUser = "testUser"; + String testPassword = "testPassword"; + String testUri = "testUri"; + Map catalogProperties = + ImmutableMap.of( + PaimonConstants.WAREHOUSE, + localWarehouse, + PaimonConstants.CATALOG_BACKEND, + "jdbc", + PaimonConstants.GRAVITINO_JDBC_USER, + testUser, + PaimonConstants.GRAVITINO_JDBC_PASSWORD, + testPassword, + PropertiesConverter.FLINK_PROPERTY_PREFIX + PaimonConstants.URI, + testUri); + Map flinkCatalogProperties = + CONVERTER.toFlinkCatalogProperties(catalogProperties); + Assertions.assertEquals( + GravitinoPaimonCatalogFactoryOptions.IDENTIFIER, flinkCatalogProperties.get("type")); + Assertions.assertEquals(localWarehouse, flinkCatalogProperties.get(PaimonConstants.WAREHOUSE)); + Assertions.assertEquals(testUser, flinkCatalogProperties.get(PaimonConstants.PAIMON_JDBC_USER)); + 
Assertions.assertEquals( + testPassword, flinkCatalogProperties.get(PaimonConstants.PAIMON_JDBC_PASSWORD)); + Assertions.assertEquals("jdbc", flinkCatalogProperties.get(PaimonConstants.METASTORE)); + Assertions.assertEquals(testUri, flinkCatalogProperties.get(PaimonConstants.URI)); + } + + @Test + public void testToGravitinoCatalogProperties() { + String testUser = "testUser"; + String testPassword = "testPassword"; + String testUri = "testUri"; + String testBackend = "jdbc"; + Configuration configuration = + Configuration.fromMap( + ImmutableMap.of( + PaimonConstants.WAREHOUSE, + localWarehouse, + PaimonConstants.METASTORE, + testBackend, + PaimonConstants.PAIMON_JDBC_USER, + testUser, + PaimonConstants.PAIMON_JDBC_PASSWORD, + testPassword, + PaimonConstants.URI, + testUri)); + Map properties = CONVERTER.toGravitinoCatalogProperties(configuration); + Assertions.assertEquals(localWarehouse, properties.get(PaimonConstants.WAREHOUSE)); + Assertions.assertEquals(testUser, properties.get(PaimonConstants.GRAVITINO_JDBC_USER)); + Assertions.assertEquals(testPassword, properties.get(PaimonConstants.GRAVITINO_JDBC_PASSWORD)); + Assertions.assertEquals(testUri, properties.get(PaimonConstants.URI)); + Assertions.assertEquals(testBackend, properties.get(PaimonConstants.CATALOG_BACKEND)); + } +} From 08f47ad4551913e7c05d2c5b08572cc342b78a5f Mon Sep 17 00:00:00 2001 From: Justin Mclean Date: Tue, 14 Jan 2025 10:34:27 +1100 Subject: [PATCH 15/40] [#6139] Refactor metalake command in Gravitino CLI (#6140) ### What changes were proposed in this pull request? The Gravitino command line class is a little large and could be broken up. ### Why are the changes needed? For readability and maintainability. Fix: #6139 ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? Tested locally. 
--------- Co-authored-by: Shaofeng Shi --- .../gravitino/cli/GravitinoCommandLine.java | 87 +------- .../gravitino/cli/MetalakeCommandHandler.java | 201 ++++++++++++++++++ 2 files changed, 202 insertions(+), 86 deletions(-) create mode 100644 clients/cli/src/main/java/org/apache/gravitino/cli/MetalakeCommandHandler.java diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java index 21d3ed176cb..cb8663ef379 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java @@ -137,7 +137,7 @@ private void executeCommand() { } else if (entity.equals(CommandEntities.CATALOG)) { new CatalogCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.METALAKE)) { - handleMetalakeCommand(); + new MetalakeCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.TOPIC)) { new TopicCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.FILESET)) { @@ -155,91 +155,6 @@ private void executeCommand() { } } - /** - * Handles the command execution for Metalakes based on command type and the command line options. 
- */ - private void handleMetalakeCommand() { - String url = getUrl(); - String auth = getAuth(); - String userName = line.getOptionValue(GravitinoOptions.LOGIN); - FullName name = new FullName(line); - String outputFormat = line.getOptionValue(GravitinoOptions.OUTPUT); - - Command.setAuthenticationMode(auth, userName); - - if (CommandActions.LIST.equals(command)) { - newListMetalakes(url, ignore, outputFormat).validate().handle(); - return; - } - - String metalake = name.getMetalakeName(); - - switch (command) { - case CommandActions.DETAILS: - if (line.hasOption(GravitinoOptions.AUDIT)) { - newMetalakeAudit(url, ignore, metalake).validate().handle(); - } else { - newMetalakeDetails(url, ignore, outputFormat, metalake).validate().handle(); - } - break; - - case CommandActions.CREATE: - String comment = line.getOptionValue(GravitinoOptions.COMMENT); - newCreateMetalake(url, ignore, metalake, comment).validate().handle(); - break; - - case CommandActions.DELETE: - boolean force = line.hasOption(GravitinoOptions.FORCE); - newDeleteMetalake(url, ignore, force, metalake).validate().handle(); - break; - - case CommandActions.SET: - String property = line.getOptionValue(GravitinoOptions.PROPERTY); - String value = line.getOptionValue(GravitinoOptions.VALUE); - newSetMetalakeProperty(url, ignore, metalake, property, value).validate().handle(); - break; - - case CommandActions.REMOVE: - property = line.getOptionValue(GravitinoOptions.PROPERTY); - newRemoveMetalakeProperty(url, ignore, metalake, property).validate().handle(); - break; - - case CommandActions.PROPERTIES: - newListMetalakeProperties(url, ignore, metalake).validate().handle(); - break; - - case CommandActions.UPDATE: - if (line.hasOption(GravitinoOptions.ENABLE) && line.hasOption(GravitinoOptions.DISABLE)) { - System.err.println(ErrorMessages.INVALID_ENABLE_DISABLE); - Main.exit(-1); - } - if (line.hasOption(GravitinoOptions.ENABLE)) { - boolean enableAllCatalogs = line.hasOption(GravitinoOptions.ALL); - 
newMetalakeEnable(url, ignore, metalake, enableAllCatalogs).validate().handle(); - } - if (line.hasOption(GravitinoOptions.DISABLE)) { - newMetalakeDisable(url, ignore, metalake).validate().handle(); - } - - if (line.hasOption(GravitinoOptions.COMMENT)) { - comment = line.getOptionValue(GravitinoOptions.COMMENT); - newUpdateMetalakeComment(url, ignore, metalake, comment).validate().handle(); - } - if (line.hasOption(GravitinoOptions.RENAME)) { - String newName = line.getOptionValue(GravitinoOptions.RENAME); - force = line.hasOption(GravitinoOptions.FORCE); - newUpdateMetalakeName(url, ignore, force, metalake, newName).validate().handle(); - } - - break; - - default: - System.err.println(ErrorMessages.UNSUPPORTED_COMMAND); - Main.exit(-1); - break; - } - } - /** Handles the command execution for Tags based on command type and the command line options. */ protected void handleTagCommand() { String url = getUrl(); diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/MetalakeCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/MetalakeCommandHandler.java new file mode 100644 index 00000000000..993116f19f5 --- /dev/null +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/MetalakeCommandHandler.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.cli; + +import org.apache.commons.cli.CommandLine; +import org.apache.gravitino.cli.commands.Command; + +/** + * Handles the command execution for Metalakes based on command type and the command line options. + */ +public class MetalakeCommandHandler extends CommandHandler { + + private final GravitinoCommandLine gravitinoCommandLine; + private final CommandLine line; + private final String command; + private final boolean ignore; + private final String url; + private String metalake; + + /** + * Constructs a MetalakeCommandHandler instance. + * + * @param gravitinoCommandLine The Gravitino command line instance. + * @param line The command line arguments. + * @param command The command to execute. + * @param ignore Ignore server version mismatch. + */ + public MetalakeCommandHandler( + GravitinoCommandLine gravitinoCommandLine, CommandLine line, String command, boolean ignore) { + this.gravitinoCommandLine = gravitinoCommandLine; + this.line = line; + this.command = command; + this.ignore = ignore; + this.url = getUrl(line); + } + + /** Handles the command execution logic based on the provided command. */ + public void handle() { + String userName = line.getOptionValue(GravitinoOptions.LOGIN); + FullName name = new FullName(line); + Command.setAuthenticationMode(getAuth(line), userName); + + if (CommandActions.LIST.equals(command)) { + handleListCommand(); + return; + } + + metalake = name.getMetalakeName(); + + if (!executeCommand()) { + System.err.println(ErrorMessages.UNSUPPORTED_COMMAND); + Main.exit(-1); + } + } + + /** + * Executes the specific command based on the command type. 
+ * + * @return true if the command is supported, false otherwise + */ + private boolean executeCommand() { + switch (command) { + case CommandActions.DETAILS: + handleDetailsCommand(); + return true; + + case CommandActions.CREATE: + handleCreateCommand(); + return true; + + case CommandActions.DELETE: + handleDeleteCommand(); + return true; + + case CommandActions.SET: + handleSetCommand(); + return true; + + case CommandActions.REMOVE: + handleRemoveCommand(); + return true; + + case CommandActions.PROPERTIES: + handlePropertiesCommand(); + return true; + + case CommandActions.UPDATE: + handleUpdateCommand(); + return true; + + default: + return false; + } + } + + /** Handles the "LIST" command. */ + private void handleListCommand() { + String outputFormat = line.getOptionValue(GravitinoOptions.OUTPUT); + gravitinoCommandLine.newListMetalakes(url, ignore, outputFormat).validate().handle(); + } + + /** Handles the "DETAILS" command. */ + private void handleDetailsCommand() { + if (line.hasOption(GravitinoOptions.AUDIT)) { + gravitinoCommandLine.newMetalakeAudit(url, ignore, metalake).validate().handle(); + } else { + String outputFormat = line.getOptionValue(GravitinoOptions.OUTPUT); + gravitinoCommandLine + .newMetalakeDetails(url, ignore, outputFormat, metalake) + .validate() + .handle(); + } + } + + /** Handles the "CREATE" command. */ + private void handleCreateCommand() { + String comment = line.getOptionValue(GravitinoOptions.COMMENT); + gravitinoCommandLine.newCreateMetalake(url, ignore, metalake, comment).validate().handle(); + } + + /** Handles the "DELETE" command. */ + private void handleDeleteCommand() { + boolean force = line.hasOption(GravitinoOptions.FORCE); + gravitinoCommandLine.newDeleteMetalake(url, ignore, force, metalake).validate().handle(); + } + + /** Handles the "SET" command. 
*/ + private void handleSetCommand() { + String property = line.getOptionValue(GravitinoOptions.PROPERTY); + String value = line.getOptionValue(GravitinoOptions.VALUE); + gravitinoCommandLine + .newSetMetalakeProperty(url, ignore, metalake, property, value) + .validate() + .handle(); + } + + /** Handles the "REMOVE" command. */ + private void handleRemoveCommand() { + String property = line.getOptionValue(GravitinoOptions.PROPERTY); + gravitinoCommandLine + .newRemoveMetalakeProperty(url, ignore, metalake, property) + .validate() + .handle(); + } + + /** Handles the "PROPERTIES" command. */ + private void handlePropertiesCommand() { + gravitinoCommandLine.newListMetalakeProperties(url, ignore, metalake).validate().handle(); + } + + /** Handles the "UPDATE" command. */ + private void handleUpdateCommand() { + if (line.hasOption(GravitinoOptions.ENABLE) && line.hasOption(GravitinoOptions.DISABLE)) { + System.err.println(ErrorMessages.INVALID_ENABLE_DISABLE); + Main.exit(-1); + } + if (line.hasOption(GravitinoOptions.ENABLE)) { + boolean enableAllCatalogs = line.hasOption(GravitinoOptions.ALL); + gravitinoCommandLine + .newMetalakeEnable(url, ignore, metalake, enableAllCatalogs) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.DISABLE)) { + gravitinoCommandLine.newMetalakeDisable(url, ignore, metalake).validate().handle(); + } + + if (line.hasOption(GravitinoOptions.COMMENT)) { + String comment = line.getOptionValue(GravitinoOptions.COMMENT); + gravitinoCommandLine + .newUpdateMetalakeComment(url, ignore, metalake, comment) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.RENAME)) { + String newName = line.getOptionValue(GravitinoOptions.RENAME); + boolean force = line.hasOption(GravitinoOptions.FORCE); + gravitinoCommandLine + .newUpdateMetalakeName(url, ignore, force, metalake, newName) + .validate() + .handle(); + } + } +} From 546a9771a44f423e6b0b16a550205b7ea689beb3 Mon Sep 17 00:00:00 2001 From: TungYuChiang 
<75083792+TungYuChiang@users.noreply.github.com> Date: Tue, 14 Jan 2025 07:36:53 +0800 Subject: [PATCH 16/40] =?UTF-8?q?[#6147]=20improve(CLI):=20Refactor=20file?= =?UTF-8?q?set=20commands=20in=20Gravitino=20CLI=C2=A0=20(#6191)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? Refactor fileset commands in cli client  ### Why are the changes needed? Fix: #6147 ### Does this PR introduce _any_ user-facing change? None ### How was this patch tested? Tested locally --- .../gravitino/cli/FilesetCommandHandler.java | 210 ++++++++++++++++++ .../gravitino/cli/GravitinoCommandLine.java | 106 +-------- 2 files changed, 211 insertions(+), 105 deletions(-) create mode 100644 clients/cli/src/main/java/org/apache/gravitino/cli/FilesetCommandHandler.java diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/FilesetCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/FilesetCommandHandler.java new file mode 100644 index 00000000000..33fc1fe9ee7 --- /dev/null +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/FilesetCommandHandler.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +package org.apache.gravitino.cli; + +import com.google.common.collect.Lists; +import java.util.List; +import java.util.Map; +import org.apache.commons.cli.CommandLine; +import org.apache.gravitino.cli.commands.Command; + +/** + * Handles the command execution for Filesets based on command type and the command line options. + */ +public class FilesetCommandHandler extends CommandHandler { + private final GravitinoCommandLine gravitinoCommandLine; + private final CommandLine line; + private final String command; + private final boolean ignore; + private final String url; + private final FullName name; + private final String metalake; + private final String catalog; + private final String schema; + private String fileset; + + /** + * Constructs a {@link FilesetCommandHandler} instance. + * + * @param gravitinoCommandLine The Gravitino command line instance. + * @param line The command line arguments. + * @param command The command to execute. + * @param ignore Ignore server version mismatch. + */ + public FilesetCommandHandler( + GravitinoCommandLine gravitinoCommandLine, CommandLine line, String command, boolean ignore) { + this.gravitinoCommandLine = gravitinoCommandLine; + this.line = line; + this.command = command; + this.ignore = ignore; + + this.url = gravitinoCommandLine.getUrl(); + this.name = new FullName(line); + this.metalake = name.getMetalakeName(); + this.catalog = name.getCatalogName(); + this.schema = name.getSchemaName(); + } + + /** Handles the command execution logic based on the provided command. 
*/ + @Override + protected void handle() { + String userName = line.getOptionValue(GravitinoOptions.LOGIN); + Command.setAuthenticationMode(gravitinoCommandLine.getAuth(), userName); + + List missingEntities = Lists.newArrayList(); + if (catalog == null) missingEntities.add(CommandEntities.CATALOG); + if (schema == null) missingEntities.add(CommandEntities.SCHEMA); + + if (CommandActions.LIST.equals(command)) { + checkEntities(missingEntities); + handleListCommand(); + return; + } + + this.fileset = name.getFilesetName(); + if (fileset == null) missingEntities.add(CommandEntities.FILESET); + checkEntities(missingEntities); + + if (!executeCommand()) { + System.err.println(ErrorMessages.UNSUPPORTED_ACTION); + Main.exit(-1); + } + } + + /** + * Executes the specific command based on the command type. + * + * @return true if the command is supported, false otherwise + */ + private boolean executeCommand() { + switch (command) { + case CommandActions.DETAILS: + handleDetailsCommand(); + return true; + + case CommandActions.CREATE: + handleCreateCommand(); + return true; + + case CommandActions.DELETE: + handleDeleteCommand(); + return true; + + case CommandActions.SET: + handleSetCommand(); + return true; + + case CommandActions.REMOVE: + handleRemoveCommand(); + return true; + + case CommandActions.PROPERTIES: + handlePropertiesCommand(); + return true; + + case CommandActions.UPDATE: + handleUpdateCommand(); + return true; + + default: + return false; + } + } + + /** Handles the "DETAILS" command. */ + private void handleDetailsCommand() { + gravitinoCommandLine + .newFilesetDetails(url, ignore, metalake, catalog, schema, fileset) + .validate() + .handle(); + } + + /** Handles the "CREATE" command. 
*/ + private void handleCreateCommand() { + String comment = line.getOptionValue(GravitinoOptions.COMMENT); + String[] properties = line.getOptionValues(CommandActions.PROPERTIES); + Map propertyMap = new Properties().parse(properties); + gravitinoCommandLine + .newCreateFileset(url, ignore, metalake, catalog, schema, fileset, comment, propertyMap) + .validate() + .handle(); + } + + /** Handles the "DELETE" command. */ + private void handleDeleteCommand() { + boolean force = line.hasOption(GravitinoOptions.FORCE); + gravitinoCommandLine + .newDeleteFileset(url, ignore, force, metalake, catalog, schema, fileset) + .validate() + .handle(); + } + + /** Handles the "SET" command. */ + private void handleSetCommand() { + String property = line.getOptionValue(GravitinoOptions.PROPERTY); + String value = line.getOptionValue(GravitinoOptions.VALUE); + gravitinoCommandLine + .newSetFilesetProperty(url, ignore, metalake, catalog, schema, fileset, property, value) + .validate() + .handle(); + } + + /** Handles the "REMOVE" command. */ + private void handleRemoveCommand() { + String property = line.getOptionValue(GravitinoOptions.PROPERTY); + gravitinoCommandLine + .newRemoveFilesetProperty(url, ignore, metalake, catalog, schema, fileset, property) + .validate() + .handle(); + } + + /** Handles the "PROPERTIES" command. */ + private void handlePropertiesCommand() { + gravitinoCommandLine + .newListFilesetProperties(url, ignore, metalake, catalog, schema, fileset) + .validate() + .handle(); + } + + /** Handles the "LIST" command. */ + private void handleListCommand() { + gravitinoCommandLine + .newListFilesets(url, ignore, metalake, catalog, schema) + .validate() + .handle(); + } + + /** Handles the "UPDATE" command. 
*/ + private void handleUpdateCommand() { + if (line.hasOption(GravitinoOptions.COMMENT)) { + String comment = line.getOptionValue(GravitinoOptions.COMMENT); + gravitinoCommandLine + .newUpdateFilesetComment(url, ignore, metalake, catalog, schema, fileset, comment) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.RENAME)) { + String newName = line.getOptionValue(GravitinoOptions.RENAME); + gravitinoCommandLine + .newUpdateFilesetName(url, ignore, metalake, catalog, schema, fileset, newName) + .validate() + .handle(); + } + } +} diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java index cb8663ef379..dd98ebf50d3 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java @@ -20,7 +20,6 @@ package org.apache.gravitino.cli; import com.google.common.base.Joiner; -import com.google.common.collect.Lists; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -28,7 +27,6 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; -import java.util.Map; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; @@ -141,7 +139,7 @@ private void executeCommand() { } else if (entity.equals(CommandEntities.TOPIC)) { new TopicCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.FILESET)) { - handleFilesetCommand(); + new FilesetCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.USER)) { new UserCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.GROUP)) { @@ -273,108 +271,6 @@ private void handleHelpCommand() { } } - /** - * Handles the command execution for filesets based 
on command type and the command line options. - */ - private void handleFilesetCommand() { - String url = getUrl(); - String auth = getAuth(); - String userName = line.getOptionValue(GravitinoOptions.LOGIN); - FullName name = new FullName(line); - String metalake = name.getMetalakeName(); - String catalog = name.getCatalogName(); - String schema = name.getSchemaName(); - - Command.setAuthenticationMode(auth, userName); - - List missingEntities = Lists.newArrayList(); - if (catalog == null) missingEntities.add(CommandEntities.CATALOG); - if (schema == null) missingEntities.add(CommandEntities.SCHEMA); - - // Handle CommandActions.LIST action separately as it doesn't require the `fileset` - if (CommandActions.LIST.equals(command)) { - checkEntities(missingEntities); - newListFilesets(url, ignore, metalake, catalog, schema).validate().handle(); - return; - } - - String fileset = name.getFilesetName(); - if (fileset == null) missingEntities.add(CommandEntities.FILESET); - checkEntities(missingEntities); - - switch (command) { - case CommandActions.DETAILS: - newFilesetDetails(url, ignore, metalake, catalog, schema, fileset).validate().handle(); - break; - - case CommandActions.CREATE: - { - String comment = line.getOptionValue(GravitinoOptions.COMMENT); - String[] properties = line.getOptionValues(CommandActions.PROPERTIES); - Map propertyMap = new Properties().parse(properties); - newCreateFileset(url, ignore, metalake, catalog, schema, fileset, comment, propertyMap) - .validate() - .handle(); - break; - } - - case CommandActions.DELETE: - { - boolean force = line.hasOption(GravitinoOptions.FORCE); - newDeleteFileset(url, ignore, force, metalake, catalog, schema, fileset) - .validate() - .handle(); - break; - } - - case CommandActions.SET: - { - String property = line.getOptionValue(GravitinoOptions.PROPERTY); - String value = line.getOptionValue(GravitinoOptions.VALUE); - newSetFilesetProperty(url, ignore, metalake, catalog, schema, fileset, property, value) - 
.validate() - .handle(); - break; - } - - case CommandActions.REMOVE: - { - String property = line.getOptionValue(GravitinoOptions.PROPERTY); - newRemoveFilesetProperty(url, ignore, metalake, catalog, schema, fileset, property) - .validate() - .handle(); - break; - } - - case CommandActions.PROPERTIES: - newListFilesetProperties(url, ignore, metalake, catalog, schema, fileset) - .validate() - .handle(); - break; - - case CommandActions.UPDATE: - { - if (line.hasOption(GravitinoOptions.COMMENT)) { - String comment = line.getOptionValue(GravitinoOptions.COMMENT); - newUpdateFilesetComment(url, ignore, metalake, catalog, schema, fileset, comment) - .validate() - .handle(); - } - if (line.hasOption(GravitinoOptions.RENAME)) { - String newName = line.getOptionValue(GravitinoOptions.RENAME); - newUpdateFilesetName(url, ignore, metalake, catalog, schema, fileset, newName) - .validate() - .handle(); - } - break; - } - - default: - System.err.println(ErrorMessages.UNSUPPORTED_ACTION); - break; - } - } - /** * Retrieves the Gravitinno URL from the command line options or the GRAVITINO_URL environment * variable or the Gravitio config file. From ea790d7e03aa9505734dd337f6e340c04a1d638a Mon Sep 17 00:00:00 2001 From: TengYao Chi Date: Tue, 14 Jan 2025 10:35:03 +0800 Subject: [PATCH 17/40] [#6152] refactor: Refactor tag commands in Gravitino CLI (#6192) ### What changes were proposed in this pull request? Reduce complexity in `GravitinoCommandLine` ### Why are the changes needed? For readability and maintainability. Fix: #6152 ### Does this PR introduce _any_ user-facing change? (Please list the user-facing changes introduced by your change, including None. ### How was this patch tested? Tested locally. 
--------- Co-authored-by: Justin Mclean --- .../gravitino/cli/GravitinoCommandLine.java | 104 +-------- .../gravitino/cli/TagCommandHandler.java | 207 ++++++++++++++++++ 2 files changed, 208 insertions(+), 103 deletions(-) create mode 100644 clients/cli/src/main/java/org/apache/gravitino/cli/TagCommandHandler.java diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java index dd98ebf50d3..11737206067 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java @@ -25,12 +25,10 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; -import java.util.Arrays; import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; -import org.apache.gravitino.cli.commands.Command; /* Gravitino Command line */ public class GravitinoCommandLine extends TestableCommandLine { @@ -145,7 +143,7 @@ private void executeCommand() { } else if (entity.equals(CommandEntities.GROUP)) { new GroupCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.TAG)) { - handleTagCommand(); + new TagCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.ROLE)) { new RoleCommandHandler(this, line, command, ignore).handle(); } else if (entity.equals(CommandEntities.MODEL)) { @@ -153,106 +151,6 @@ private void executeCommand() { } } - /** Handles the command execution for Tags based on command type and the command line options. 
*/ - protected void handleTagCommand() { - String url = getUrl(); - String auth = getAuth(); - String userName = line.getOptionValue(GravitinoOptions.LOGIN); - FullName name = new FullName(line); - String metalake = name.getMetalakeName(); - - Command.setAuthenticationMode(auth, userName); - - String[] tags = line.getOptionValues(GravitinoOptions.TAG); - - if (tags != null) { - tags = Arrays.stream(tags).distinct().toArray(String[]::new); - } - - switch (command) { - case CommandActions.DETAILS: - newTagDetails(url, ignore, metalake, getOneTag(tags)).validate().handle(); - break; - - case CommandActions.LIST: - if (!name.hasCatalogName()) { - newListTags(url, ignore, metalake).validate().handle(); - } else { - newListEntityTags(url, ignore, metalake, name).validate().handle(); - } - break; - - case CommandActions.CREATE: - String comment = line.getOptionValue(GravitinoOptions.COMMENT); - newCreateTags(url, ignore, metalake, tags, comment).validate().handle(); - break; - - case CommandActions.DELETE: - boolean forceDelete = line.hasOption(GravitinoOptions.FORCE); - newDeleteTag(url, ignore, forceDelete, metalake, tags).validate().handle(); - break; - - case CommandActions.SET: - String propertySet = line.getOptionValue(GravitinoOptions.PROPERTY); - String valueSet = line.getOptionValue(GravitinoOptions.VALUE); - if (propertySet == null && valueSet == null) { - newTagEntity(url, ignore, metalake, name, tags).validate().handle(); - } else { - newSetTagProperty(url, ignore, metalake, getOneTag(tags), propertySet, valueSet) - .validate() - .handle(); - } - break; - - case CommandActions.REMOVE: - boolean isTag = line.hasOption(GravitinoOptions.TAG); - if (!isTag) { - boolean forceRemove = line.hasOption(GravitinoOptions.FORCE); - newRemoveAllTags(url, ignore, metalake, name, forceRemove).validate().handle(); - } else { - String propertyRemove = line.getOptionValue(GravitinoOptions.PROPERTY); - if (propertyRemove != null) { - newRemoveTagProperty(url, ignore, metalake, 
getOneTag(tags), propertyRemove) - .validate() - .handle(); - } else { - newUntagEntity(url, ignore, metalake, name, tags).validate().handle(); - } - } - break; - - case CommandActions.PROPERTIES: - newListTagProperties(url, ignore, metalake, getOneTag(tags)).validate().handle(); - break; - - case CommandActions.UPDATE: - if (line.hasOption(GravitinoOptions.COMMENT)) { - String updateComment = line.getOptionValue(GravitinoOptions.COMMENT); - newUpdateTagComment(url, ignore, metalake, getOneTag(tags), updateComment) - .validate() - .handle(); - } - if (line.hasOption(GravitinoOptions.RENAME)) { - String newName = line.getOptionValue(GravitinoOptions.RENAME); - newUpdateTagName(url, ignore, metalake, getOneTag(tags), newName).validate().handle(); - } - break; - - default: - System.err.println(ErrorMessages.UNSUPPORTED_ACTION); - Main.exit(-1); - break; - } - } - - private String getOneTag(String[] tags) { - if (tags == null || tags.length > 1) { - System.err.println(ErrorMessages.MULTIPLE_TAG_COMMAND_ERROR); - Main.exit(-1); - } - return tags[0]; - } - private void handleHelpCommand() { String helpFile = entity.toLowerCase() + "_help.txt"; diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/TagCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/TagCommandHandler.java new file mode 100644 index 00000000000..e274c271f9c --- /dev/null +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/TagCommandHandler.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.gravitino.cli; + +import java.util.Arrays; +import org.apache.commons.cli.CommandLine; +import org.apache.gravitino.cli.commands.Command; + +public class TagCommandHandler extends CommandHandler { + private final GravitinoCommandLine gravitinoCommandLine; + private final CommandLine line; + private final String command; + private final boolean ignore; + private final String url; + private String[] tags; + private String metalake; + + public TagCommandHandler( + GravitinoCommandLine gravitinoCommandLine, CommandLine line, String command, boolean ignore) { + this.gravitinoCommandLine = gravitinoCommandLine; + this.line = line; + this.command = command; + this.ignore = ignore; + this.url = getUrl(line); + this.tags = line.getOptionValues(GravitinoOptions.TAG); + + if (tags != null) { + tags = Arrays.stream(tags).distinct().toArray(String[]::new); + } + } + + @Override + public void handle() { + String userName = line.getOptionValue(GravitinoOptions.LOGIN); + FullName name = new FullName(line); + Command.setAuthenticationMode(getAuth(line), userName); + + metalake = name.getMetalakeName(); + + if (!executeCommand()) { + System.err.println(ErrorMessages.UNSUPPORTED_COMMAND); + Main.exit(-1); + } + } + + /** + * Executes the specific command based on the command type. 
+ * + * @return true if the command is supported, false otherwise + */ + private boolean executeCommand() { + switch (command) { + case CommandActions.DETAILS: + handleDetailsCommand(); + return true; + + case CommandActions.LIST: + handleListCommand(); + return true; + + case CommandActions.CREATE: + handleCreateCommand(); + return true; + + case CommandActions.DELETE: + handleDeleteCommand(); + return true; + + case CommandActions.SET: + handleSetCommand(); + return true; + + case CommandActions.REMOVE: + handleRemoveCommand(); + return true; + + case CommandActions.PROPERTIES: + handlePropertiesCommand(); + return true; + + case CommandActions.UPDATE: + handleUpdateCommand(); + return true; + + default: + return false; + } + } + + /** Handles the "LIST" command. */ + private void handleListCommand() { + FullName name = new FullName(line); + if (!name.hasCatalogName()) { + gravitinoCommandLine.newListTags(url, ignore, metalake).validate().handle(); + } else { + gravitinoCommandLine.newListEntityTags(url, ignore, metalake, name).validate().handle(); + } + } + + /** Handles the "DETAILS" command. */ + private void handleDetailsCommand() { + gravitinoCommandLine.newTagDetails(url, ignore, metalake, getOneTag(tags)).validate().handle(); + } + + /** Handles the "CREATE" command. */ + private void handleCreateCommand() { + String comment = line.getOptionValue(GravitinoOptions.COMMENT); + gravitinoCommandLine.newCreateTags(url, ignore, metalake, tags, comment).validate().handle(); + } + + /** Handles the "DELETE" command. */ + private void handleDeleteCommand() { + boolean forceDelete = line.hasOption(GravitinoOptions.FORCE); + gravitinoCommandLine.newDeleteTag(url, ignore, forceDelete, metalake, tags).validate().handle(); + } + + /** Handles the "SET" command. 
*/ + private void handleSetCommand() { + String property = line.getOptionValue(GravitinoOptions.PROPERTY); + String value = line.getOptionValue(GravitinoOptions.VALUE); + if (property == null && value == null) { + gravitinoCommandLine + .newTagEntity(url, ignore, metalake, new FullName(line), tags) + .validate() + .handle(); + } else { + gravitinoCommandLine + .newSetTagProperty(url, ignore, metalake, getOneTag(tags), property, value) + .validate() + .handle(); + } + } + + /** Handles the "REMOVE" command. */ + private void handleRemoveCommand() { + boolean isTag = line.hasOption(GravitinoOptions.TAG); + FullName name = new FullName(line); + if (!isTag) { + boolean forceRemove = line.hasOption(GravitinoOptions.FORCE); + gravitinoCommandLine + .newRemoveAllTags(url, ignore, metalake, name, forceRemove) + .validate() + .handle(); + } else { + String propertyRemove = line.getOptionValue(GravitinoOptions.PROPERTY); + if (propertyRemove != null) { + gravitinoCommandLine + .newRemoveTagProperty(url, ignore, metalake, getOneTag(tags), propertyRemove) + .validate() + .handle(); + } else { + gravitinoCommandLine.newUntagEntity(url, ignore, metalake, name, tags).validate().handle(); + } + } + } + + /** Handles the "PROPERTIES" command. */ + private void handlePropertiesCommand() { + gravitinoCommandLine + .newListTagProperties(url, ignore, metalake, getOneTag(tags)) + .validate() + .handle(); + } + + /** Handles the "UPDATE" command. 
*/ + private void handleUpdateCommand() { + + if (line.hasOption(GravitinoOptions.COMMENT)) { + String updateComment = line.getOptionValue(GravitinoOptions.COMMENT); + gravitinoCommandLine + .newUpdateTagComment(url, ignore, metalake, getOneTag(tags), updateComment) + .validate() + .handle(); + } + if (line.hasOption(GravitinoOptions.RENAME)) { + String newName = line.getOptionValue(GravitinoOptions.RENAME); + gravitinoCommandLine + .newUpdateTagName(url, ignore, metalake, getOneTag(tags), newName) + .validate() + .handle(); + } + } + + private String getOneTag(String[] tags) { + if (tags == null || tags.length > 1) { + System.err.println(ErrorMessages.MULTIPLE_TAG_COMMAND_ERROR); + Main.exit(-1); + } + return tags[0]; + } +} From 63f9ae6b2dbe777b79157fa5e3df62f0aa9e70f2 Mon Sep 17 00:00:00 2001 From: Qi Yu Date: Tue, 14 Jan 2025 14:03:41 +0800 Subject: [PATCH 18/40] [#5361] improvment(hadoop-catalog): Introduce timeout mechanism to get Hadoop File System. (#5406) ### What changes were proposed in this pull request? Introduce a timeout mechanism when getting a Hadoop FileSystem instance. ### Why are the changes needed? Cloud filesystem like S3 and OSS(10 minutes) has a very long connection and can't be tune by configuration, this will cause deadlock as it will hold the tree lock for a long time Fix: #5361 Fix: #6156 ### Does this PR introduce _any_ user-facing change? N/A. ### How was this patch tested? Existing test. 
--- LICENSE.bin | 1 + catalogs/catalog-hadoop/build.gradle.kts | 1 + .../hadoop/HadoopCatalogOperations.java | 35 ++++++++++++++++++- .../HadoopCatalogPropertiesMetadata.java | 11 ++++++ .../apache/gravitino/lock/LockManager.java | 9 +++-- docs/hadoop-catalog.md | 9 ++--- 6 files changed, 59 insertions(+), 7 deletions(-) diff --git a/LICENSE.bin b/LICENSE.bin index effaa4ac4a2..d1dddd52795 100644 --- a/LICENSE.bin +++ b/LICENSE.bin @@ -374,6 +374,7 @@ Apache Arrow Rome Jettison + Awaitility This product bundles various third-party components also under the Apache Software Foundation License 1.1 diff --git a/catalogs/catalog-hadoop/build.gradle.kts b/catalogs/catalog-hadoop/build.gradle.kts index d599a5e72f1..3108d993c1a 100644 --- a/catalogs/catalog-hadoop/build.gradle.kts +++ b/catalogs/catalog-hadoop/build.gradle.kts @@ -54,6 +54,7 @@ dependencies { exclude("org.fusesource.leveldbjni") } implementation(libs.slf4j.api) + implementation(libs.awaitility) compileOnly(libs.guava) diff --git a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java index 36177bea37f..6c032414be5 100644 --- a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java +++ b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java @@ -31,6 +31,8 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.lang3.StringUtils; import org.apache.gravitino.Catalog; import org.apache.gravitino.Entity; @@ -71,6 +73,8 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.awaitility.Awaitility; +import org.awaitility.core.ConditionTimeoutException; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -755,6 +759,35 @@ FileSystem getFileSystem(Path path, Map config) throws IOExcepti scheme, path, fileSystemProvidersMap.keySet(), fileSystemProvidersMap.values())); } - return provider.getFileSystem(path, config); + int timeoutSeconds = + (int) + propertiesMetadata + .catalogPropertiesMetadata() + .getOrDefault( + config, HadoopCatalogPropertiesMetadata.FILESYSTEM_CONNECTION_TIMEOUT_SECONDS); + try { + AtomicReference fileSystem = new AtomicReference<>(); + Awaitility.await() + .atMost(timeoutSeconds, TimeUnit.SECONDS) + .until( + () -> { + fileSystem.set(provider.getFileSystem(path, config)); + return true; + }); + return fileSystem.get(); + } catch (ConditionTimeoutException e) { + throw new IOException( + String.format( + "Failed to get FileSystem for path: %s, scheme: %s, provider: %s, config: %s within %s " + + "seconds, please check the configuration or increase the " + + "file system connection timeout time by setting catalog property: %s", + path, + scheme, + provider, + config, + timeoutSeconds, + HadoopCatalogPropertiesMetadata.FILESYSTEM_CONNECTION_TIMEOUT_SECONDS), + e); + } } } diff --git a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java index 22cf0d5b2cd..3bdc125efc8 100644 --- a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java +++ b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java @@ -53,6 +53,9 @@ public class HadoopCatalogPropertiesMetadata extends BaseCatalogPropertiesMetada */ public static final String DEFAULT_FS_PROVIDER = "default-filesystem-provider"; + static final String FILESYSTEM_CONNECTION_TIMEOUT_SECONDS = "filesystem-conn-timeout-secs"; + static final int 
DEFAULT_GET_FILESYSTEM_TIMEOUT_SECONDS = 6; + public static final String BUILTIN_LOCAL_FS_PROVIDER = "builtin-local"; public static final String BUILTIN_HDFS_FS_PROVIDER = "builtin-hdfs"; @@ -82,6 +85,14 @@ public class HadoopCatalogPropertiesMetadata extends BaseCatalogPropertiesMetada false /* immutable */, BUILTIN_LOCAL_FS_PROVIDER, // please see LocalFileSystemProvider#name() false /* hidden */)) + .put( + FILESYSTEM_CONNECTION_TIMEOUT_SECONDS, + PropertyEntry.integerOptionalPropertyEntry( + FILESYSTEM_CONNECTION_TIMEOUT_SECONDS, + "Timeout to wait for to create the Hadoop file system client instance.", + false /* immutable */, + DEFAULT_GET_FILESYSTEM_TIMEOUT_SECONDS, + false /* hidden */)) // The following two are about authentication. .putAll(KERBEROS_PROPERTY_ENTRIES) .putAll(AuthenticationConfig.AUTHENTICATION_PROPERTY_ENTRIES) diff --git a/core/src/main/java/org/apache/gravitino/lock/LockManager.java b/core/src/main/java/org/apache/gravitino/lock/LockManager.java index 222dee8daad..d52c858cc43 100644 --- a/core/src/main/java/org/apache/gravitino/lock/LockManager.java +++ b/core/src/main/java/org/apache/gravitino/lock/LockManager.java @@ -26,6 +26,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import com.google.common.util.concurrent.ThreadFactoryBuilder; +import java.text.SimpleDateFormat; import java.util.List; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -136,10 +137,14 @@ void checkDeadLock(TreeLockNode node) { // If the thread is holding the lock for more than 30 seconds, we will log it. 
if (System.currentTimeMillis() - ts > 30000) { LOG.warn( - "Dead lock detected for thread with identifier {} on node {}, threads that holding the node: {} ", + "Thread with identifier {} holds the lock node {} for more than 30s since {}, please " + + "check if some dead lock or thread hang like io-connection hangs", threadIdentifier, node, - node.getHoldingThreadTimestamp()); + // SimpleDateFormat is not thread-safe, so we should create a new instance for + // each time + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + .format(node.getHoldingThreadTimestamp())); } }); } diff --git a/docs/hadoop-catalog.md b/docs/hadoop-catalog.md index 99e1dd7854e..cbdae846899 100644 --- a/docs/hadoop-catalog.md +++ b/docs/hadoop-catalog.md @@ -23,10 +23,11 @@ Hadoop 3. If there's any compatibility issue, please create an [issue](https://g Besides the [common catalog properties](./gravitino-server-config.md#apache-gravitino-catalog-properties-configuration), the Hadoop catalog has the following properties: -| Property Name | Description | Default Value | Required | Since Version | -|------------------------|----------------------------------------------------|---------------|----------|------------------| -| `location` | The storage location managed by Hadoop catalog. | (none) | No | 0.5.0 | -| `credential-providers` | The credential provider types, separated by comma. | (none) | No | 0.8.0-incubating | +| Property Name | Description | Default Value | Required | Since Version | +|--------------------------------|-----------------------------------------------------------------------------------------------------|---------------|----------|------------------| +| `location` | The storage location managed by Hadoop catalog. | (none) | No | 0.5.0 | +| `filesystem-conn-timeout-secs` | The timeout of getting the file system using Hadoop FileSystem client instance. Time unit: seconds. 
| 6 | No | 0.8.0-incubating | +| `credential-providers` | The credential provider types, separated by comma. | (none) | No | 0.8.0-incubating | Please refer to [Credential vending](./security/credential-vending.md) for more details about credential vending. From c6476b85432428ed958516a2024e52f8226a6536 Mon Sep 17 00:00:00 2001 From: Yuhui Date: Tue, 14 Jan 2025 15:59:06 +0800 Subject: [PATCH 19/40] [#6131] feat (gvfs-fuse): Add integration test framework of gvfs-fuse (#6160) ### What changes were proposed in this pull request? Add integration test framework of gvfs-fuse Integrate LocalStack into the gvfs-fuse integration test Add ci pipeline for integration test ### Why are the changes needed? Fix: #6131 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? IT --- .github/workflows/gvfs-fuse-build-test.yml | 14 ++- clients/filesystem-fuse/Makefile | 6 + .../src/default_raw_filesystem.rs | 15 ++- .../filesystem-fuse/src/gravitino_client.rs | 26 +++- .../src/gravitino_fileset_filesystem.rs | 81 +++++++++++- clients/filesystem-fuse/src/gvfs_creator.rs | 10 +- clients/filesystem-fuse/src/lib.rs | 13 ++ .../src/open_dal_filesystem.rs | 47 +++++-- clients/filesystem-fuse/src/s3_filesystem.rs | 113 +++++++++-------- clients/filesystem-fuse/tests/bin/env.sh | 65 ++++++++++ .../tests/bin/gravitino_server.sh | 116 ++++++++++++++++++ .../filesystem-fuse/tests/bin/gvfs_fuse.sh | 65 ++++++++++ .../filesystem-fuse/tests/bin/localstatck.sh | 46 +++++++ .../tests/bin/run_fuse_testers.sh | 70 +++++++++++ .../tests/bin/run_s3fs_testers.sh | 64 ++++++++++ .../tests/conf/gvfs_fuse_s3.toml | 3 +- clients/filesystem-fuse/tests/fuse_test.rs | 22 ++-- 17 files changed, 696 insertions(+), 80 deletions(-) create mode 100644 clients/filesystem-fuse/tests/bin/env.sh create mode 100644 clients/filesystem-fuse/tests/bin/gravitino_server.sh create mode 100644 clients/filesystem-fuse/tests/bin/gvfs_fuse.sh create mode 100644 
clients/filesystem-fuse/tests/bin/localstatck.sh create mode 100755 clients/filesystem-fuse/tests/bin/run_fuse_testers.sh create mode 100644 clients/filesystem-fuse/tests/bin/run_s3fs_testers.sh diff --git a/.github/workflows/gvfs-fuse-build-test.yml b/.github/workflows/gvfs-fuse-build-test.yml index 4af01d82da3..4fe7b66e09d 100644 --- a/.github/workflows/gvfs-fuse-build-test.yml +++ b/.github/workflows/gvfs-fuse-build-test.yml @@ -71,10 +71,18 @@ jobs: run: | dev/ci/check_commands.sh - - name: Build and test Gravitino + - name: Build Gvfs-fuse run: | ./gradlew :clients:filesystem-fuse:build -PenableFuse=true + - name: Integration test + run: | + ./gradlew build -x :clients:client-python:build -x test -x web -PjdkVersion=${{ matrix.java-version }} + ./gradlew compileDistribution -x :clients:client-python:build -x test -x web -PjdkVersion=${{ matrix.java-version }} + cd clients/filesystem-fuse + make test-s3 + make test-fuse-it + - name: Free up disk space run: | dev/ci/util_free_space.sh @@ -85,5 +93,7 @@ jobs: with: name: Gvfs-fuse integrate-test-reports-${{ matrix.java-version }} path: | - clients/filesystem-fuse/build/test/log/*.log + clients/filesystem-fuse/target/debug/fuse.log + distribution/package/logs/gravitino-server.out + distribution/package/logs/gravitino-server.log diff --git a/clients/filesystem-fuse/Makefile b/clients/filesystem-fuse/Makefile index f4a4cef20ae..86dd2f22152 100644 --- a/clients/filesystem-fuse/Makefile +++ b/clients/filesystem-fuse/Makefile @@ -62,6 +62,12 @@ doc-test: unit-test: doc-test cargo test --no-fail-fast --lib --all-features --workspace +test-fuse-it: + @bash ./tests/bin/run_fuse_testers.sh test + +test-s3: + @bash ./tests/bin/run_s3fs_testers.sh test + test: doc-test cargo test --no-fail-fast --all-targets --all-features --workspace diff --git a/clients/filesystem-fuse/src/default_raw_filesystem.rs b/clients/filesystem-fuse/src/default_raw_filesystem.rs index 944181246d5..d1d8e7605df 100644 --- 
a/clients/filesystem-fuse/src/default_raw_filesystem.rs +++ b/clients/filesystem-fuse/src/default_raw_filesystem.rs @@ -334,13 +334,22 @@ impl RawFileSystem for DefaultRawFileSystem { file.flush().await } - async fn close_file(&self, _file_id: u64, fh: u64) -> Result<()> { + async fn close_file(&self, file_id: u64, fh: u64) -> Result<()> { + let file_entry = self.get_file_entry(file_id).await; + let opened_file = self .opened_file_manager .remove(fh) .ok_or(Errno::from(libc::EBADF))?; - let mut file = opened_file.lock().await; - file.close().await + + // todo: need to handle racing condition and corner case when the file has been deleted. + if file_entry.is_ok() { + let mut file = opened_file.lock().await; + file.close().await + } else { + // If the file has been deleted, it does not cause a leak even if it has not been closed. + Ok(()) + } } async fn read(&self, file_id: u64, fh: u64, offset: u64, size: u32) -> Result { diff --git a/clients/filesystem-fuse/src/gravitino_client.rs b/clients/filesystem-fuse/src/gravitino_client.rs index 9bdfbb2c288..1e1cd411eac 100644 --- a/clients/filesystem-fuse/src/gravitino_client.rs +++ b/clients/filesystem-fuse/src/gravitino_client.rs @@ -199,10 +199,34 @@ impl GravitinoClient { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use mockito::mock; + pub(crate) fn create_test_catalog( + name: &str, + provider: &str, + properties: HashMap, + ) -> Catalog { + Catalog { + name: name.to_string(), + catalog_type: "fileset".to_string(), + provider: provider.to_string(), + comment: "".to_string(), + properties: properties, + } + } + + pub(crate) fn create_test_fileset(name: &str, storage_location: &str) -> Fileset { + Fileset { + name: name.to_string(), + fileset_type: "managed".to_string(), + comment: "".to_string(), + storage_location: storage_location.to_string(), + properties: HashMap::default(), + } + } + #[tokio::test] async fn test_get_fileset_success() { let fileset_response = r#" diff --git 
a/clients/filesystem-fuse/src/gravitino_fileset_filesystem.rs b/clients/filesystem-fuse/src/gravitino_fileset_filesystem.rs index 7da2f572dcc..04236dfe841 100644 --- a/clients/filesystem-fuse/src/gravitino_fileset_filesystem.rs +++ b/clients/filesystem-fuse/src/gravitino_fileset_filesystem.rs @@ -140,16 +140,27 @@ impl PathFileSystem for GravitinoFilesetFileSystem { #[cfg(test)] mod tests { - use crate::config::GravitinoConfig; + use crate::config::{AppConfig, GravitinoConfig}; + use crate::default_raw_filesystem::DefaultRawFileSystem; + use crate::filesystem::tests::{TestPathFileSystem, TestRawFileSystem}; + use crate::filesystem::{FileSystemContext, PathFileSystem, RawFileSystem}; + use crate::gravitino_client::tests::{create_test_catalog, create_test_fileset}; + use crate::gravitino_client::GravitinoClient; use crate::gravitino_fileset_filesystem::GravitinoFilesetFileSystem; + use crate::gvfs_creator::create_fs_with_fileset; use crate::memory_filesystem::MemoryFileSystem; + use crate::s3_filesystem::extract_s3_config; + use crate::s3_filesystem::tests::{cleanup_s3_fs, s3_test_config}; + use crate::test_enable_with; + use crate::RUN_TEST_WITH_S3; + use std::collections::HashMap; use std::path::Path; #[tokio::test] async fn test_map_fileset_path_to_raw_path() { let fs = GravitinoFilesetFileSystem { physical_fs: Box::new(MemoryFileSystem::new().await), - client: super::GravitinoClient::new(&GravitinoConfig::default()), + client: GravitinoClient::new(&GravitinoConfig::default()), location: "/c1/fileset1".into(), }; let path = fs.gvfs_path_to_raw_path(Path::new("/a")); @@ -162,7 +173,7 @@ mod tests { async fn test_map_raw_path_to_fileset_path() { let fs = GravitinoFilesetFileSystem { physical_fs: Box::new(MemoryFileSystem::new().await), - client: super::GravitinoClient::new(&GravitinoConfig::default()), + client: GravitinoClient::new(&GravitinoConfig::default()), location: "/c1/fileset1".into(), }; let path = fs @@ -172,4 +183,68 @@ mod tests { let path = 
fs.raw_path_to_gvfs_path(Path::new("/c1/fileset1")).unwrap(); assert_eq!(path, Path::new("/")); } + + async fn create_fileset_fs(path: &Path, config: &AppConfig) -> GravitinoFilesetFileSystem { + let opendal_config = extract_s3_config(config); + + cleanup_s3_fs(path, &opendal_config).await; + + let bucket = opendal_config.get("bucket").expect("Bucket must exist"); + let endpoint = opendal_config.get("endpoint").expect("Endpoint must exist"); + + let catalog = create_test_catalog( + "c1", + "s3", + vec![ + ("location".to_string(), format!("s3a://{}", bucket)), + ("s3-endpoint".to_string(), endpoint.to_string()), + ] + .into_iter() + .collect::>(), + ); + let file_set_location = format!("s3a://{}{}", bucket, path.to_string_lossy()); + let file_set = create_test_fileset("fileset1", &file_set_location); + + let fs_context = FileSystemContext::default(); + let inner_fs = create_fs_with_fileset(&catalog, &file_set, config, &fs_context) + .await + .unwrap(); + GravitinoFilesetFileSystem::new( + inner_fs, + path, + GravitinoClient::new(&config.gravitino), + config, + &fs_context, + ) + .await + } + + #[tokio::test] + async fn s3_ut_test_fileset_file_system() { + test_enable_with!(RUN_TEST_WITH_S3); + + let config = s3_test_config(); + let cwd = Path::new("/gvfs_test3"); + let fs = create_fileset_fs(cwd, &config).await; + let _ = fs.init().await; + let mut tester = TestPathFileSystem::new(Path::new("/"), fs); + tester.test_path_file_system().await; + } + + #[tokio::test] + async fn s3_ut_test_fileset_with_raw_file_system() { + test_enable_with!(RUN_TEST_WITH_S3); + + let config = s3_test_config(); + let cwd = Path::new("/gvfs_test4"); + let fileset_fs = create_fileset_fs(cwd, &config).await; + let raw_fs = DefaultRawFileSystem::new( + fileset_fs, + &AppConfig::default(), + &FileSystemContext::default(), + ); + let _ = raw_fs.init().await; + let mut tester = TestRawFileSystem::new(Path::new("/"), raw_fs); + tester.test_raw_file_system().await; + } } diff --git 
a/clients/filesystem-fuse/src/gvfs_creator.rs b/clients/filesystem-fuse/src/gvfs_creator.rs index aac88ad9d08..88bc8a1b422 100644 --- a/clients/filesystem-fuse/src/gvfs_creator.rs +++ b/clients/filesystem-fuse/src/gvfs_creator.rs @@ -87,7 +87,7 @@ pub async fn create_gvfs_filesystem( .get_fileset(&catalog_name, &schema_name, &fileset_name) .await?; - let inner_fs = create_fs_with_fileset(&catalog, &fileset, config, fs_context)?; + let inner_fs = create_fs_with_fileset(&catalog, &fileset, config, fs_context).await?; let target_path = extract_root_path(fileset.storage_location.as_str())?; let fs = @@ -95,7 +95,7 @@ pub async fn create_gvfs_filesystem( Ok(CreateFileSystemResult::Gvfs(fs)) } -fn create_fs_with_fileset( +pub(crate) async fn create_fs_with_fileset( catalog: &Catalog, fileset: &Fileset, config: &AppConfig, @@ -104,9 +104,9 @@ fn create_fs_with_fileset( let schema = extract_filesystem_scheme(&fileset.storage_location)?; match schema { - FileSystemSchema::S3 => Ok(Box::new(S3FileSystem::new( - catalog, fileset, config, fs_context, - )?)), + FileSystemSchema::S3 => Ok(Box::new( + S3FileSystem::new(catalog, fileset, config, fs_context).await?, + )), } } diff --git a/clients/filesystem-fuse/src/lib.rs b/clients/filesystem-fuse/src/lib.rs index 31e7c7fd8e1..41a9a5335d5 100644 --- a/clients/filesystem-fuse/src/lib.rs +++ b/clients/filesystem-fuse/src/lib.rs @@ -36,6 +36,19 @@ mod opened_file_manager; mod s3_filesystem; mod utils; +#[macro_export] +macro_rules! 
test_enable_with { + ($env_var:expr) => { + if std::env::var($env_var).is_err() { + println!("Test skipped because {} is not set", $env_var); + return; + } + }; +} + +pub const RUN_TEST_WITH_S3: &str = "RUN_TEST_WITH_S3"; +pub const RUN_TEST_WITH_FUSE: &str = "RUN_TEST_WITH_FUSE"; + pub async fn gvfs_mount(mount_to: &str, mount_from: &str, config: &AppConfig) -> GvfsResult<()> { gvfs_fuse::mount(mount_to, mount_from, config).await } diff --git a/clients/filesystem-fuse/src/open_dal_filesystem.rs b/clients/filesystem-fuse/src/open_dal_filesystem.rs index e53fbaf6032..d32b014d1f0 100644 --- a/clients/filesystem-fuse/src/open_dal_filesystem.rs +++ b/clients/filesystem-fuse/src/open_dal_filesystem.rs @@ -261,22 +261,29 @@ fn opendal_filemode_to_filetype(mode: EntryMode) -> FileType { mod test { use crate::config::AppConfig; use crate::s3_filesystem::extract_s3_config; + use crate::s3_filesystem::tests::s3_test_config; + use crate::test_enable_with; + use crate::RUN_TEST_WITH_S3; use opendal::layers::LoggingLayer; use opendal::{services, Builder, Operator}; - #[tokio::test] - async fn test_s3_stat() { - let config = AppConfig::from_file(Some("tests/conf/gvfs_fuse_s3.toml")).unwrap(); - let opendal_config = extract_s3_config(&config); - + fn create_opendal(config: &AppConfig) -> Operator { + let opendal_config = extract_s3_config(config); let builder = services::S3::from_map(opendal_config); // Init an operator - let op = Operator::new(builder) + Operator::new(builder) .expect("opendal create failed") .layer(LoggingLayer::default()) - .finish(); + .finish() + } + + #[tokio::test] + async fn s3_ut_test_s3_stat() { + test_enable_with!(RUN_TEST_WITH_S3); + let config = s3_test_config(); + let op = create_opendal(&config); let path = "/"; let list = op.list(path).await; if let Ok(l) = list { @@ -294,4 +301,30 @@ mod test { println!("stat error: {:?}", meta.err()); } } + + #[tokio::test] + async fn s3_ut_test_s3_delete() { + test_enable_with!(RUN_TEST_WITH_S3); + let config = 
s3_test_config(); + + let op = create_opendal(&config); + let path = "/s1/fileset1/gvfs_test/test_dir/test_file"; + + let meta = op.stat(path).await; + if let Ok(m) = meta { + println!("stat result: {:?}", m); + } else { + println!("stat error: {:?}", meta.err()); + } + + let result = op.remove(vec![path.to_string()]).await; + match result { + Ok(_) => { + println!("Delete successful (or no-op)."); + } + Err(e) => { + println!("Delete failed: {:?}", e); + } + } + } } diff --git a/clients/filesystem-fuse/src/s3_filesystem.rs b/clients/filesystem-fuse/src/s3_filesystem.rs index e0ca69b4ccf..35a091b3fe1 100644 --- a/clients/filesystem-fuse/src/s3_filesystem.rs +++ b/clients/filesystem-fuse/src/s3_filesystem.rs @@ -40,7 +40,7 @@ impl S3FileSystem {} impl S3FileSystem { const S3_CONFIG_PREFIX: &'static str = "s3-"; - pub(crate) fn new( + pub(crate) async fn new( catalog: &Catalog, fileset: &Fileset, config: &AppConfig, @@ -48,10 +48,20 @@ impl S3FileSystem { ) -> GvfsResult { let mut opendal_config = extract_s3_config(config); let bucket = extract_bucket(&fileset.storage_location)?; - opendal_config.insert("bucket".to_string(), bucket); + opendal_config.insert("bucket".to_string(), bucket.to_string()); - let region = Self::get_s3_region(catalog)?; - opendal_config.insert("region".to_string(), region); + let endpoint = catalog.properties.get("s3-endpoint"); + if endpoint.is_none() { + return Err(InvalidConfig.to_error("s3-endpoint is required".to_string())); + } + let endpoint = endpoint.unwrap(); + opendal_config.insert("endpoint".to_string(), endpoint.clone()); + + let region = Self::get_s3_region(catalog, &bucket).await; + if region.is_none() { + return Err(InvalidConfig.to_error("s3-region is required".to_string())); + } + opendal_config.insert("region".to_string(), region.unwrap()); let builder = S3::from_map(opendal_config); @@ -67,16 +77,13 @@ impl S3FileSystem { }) } - fn get_s3_region(catalog: &Catalog) -> GvfsResult { + async fn get_s3_region(catalog: &Catalog, 
bucket: &str) -> Option { if let Some(region) = catalog.properties.get("s3-region") { - Ok(region.clone()) + Some(region.clone()) } else if let Some(endpoint) = catalog.properties.get("s3-endpoint") { - extract_region(endpoint) + S3::detect_region(endpoint, bucket).await } else { - Err(InvalidConfig.to_error(format!( - "Cant not retrieve region in the Catalog {}", - catalog.name - ))) + None } } } @@ -139,25 +146,11 @@ pub(crate) fn extract_bucket(location: &str) -> GvfsResult { } } -pub(crate) fn extract_region(location: &str) -> GvfsResult { - let url = parse_location(location)?; - match url.host_str() { - Some(host) => { - let parts: Vec<&str> = host.split('.').collect(); - if parts.len() > 1 { - Ok(parts[1].to_string()) - } else { - Err(InvalidConfig.to_error(format!( - "Invalid location: expected region in host, got {}", - location - ))) - } - } - None => Err(InvalidConfig.to_error(format!( - "Invalid fileset location without bucket: {}", - location - ))), - } +pub(crate) fn extract_region(location: &str) -> Option { + parse_location(location).ok().and_then(|url| { + url.host_str() + .and_then(|host| host.split('.').nth(1).map(|part| part.to_string())) + }) } pub fn extract_s3_config(config: &AppConfig) -> HashMap { @@ -181,11 +174,13 @@ pub fn extract_s3_config(config: &AppConfig) -> HashMap { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use crate::default_raw_filesystem::DefaultRawFileSystem; use crate::filesystem::tests::{TestPathFileSystem, TestRawFileSystem}; use crate::filesystem::RawFileSystem; + use crate::test_enable_with; + use crate::RUN_TEST_WITH_S3; use opendal::layers::TimeoutLayer; use std::time::Duration; @@ -201,11 +196,11 @@ mod tests { fn test_extract_region() { let location = "http://s3.ap-southeast-2.amazonaws.com"; let result = extract_region(location); - assert!(result.is_ok()); + assert!(result.is_some()); assert_eq!(result.unwrap(), "ap-southeast-2"); } - async fn delete_dir(op: &Operator, dir_name: &str) { + 
pub(crate) async fn delete_dir(op: &Operator, dir_name: &str) { let childs = op.list(dir_name).await.expect("list dir failed"); for child in childs { let child_name = dir_name.to_string() + child.name(); @@ -218,13 +213,11 @@ mod tests { op.delete(dir_name).await.expect("delete dir failed"); } - async fn create_s3_fs(cwd: &Path) -> S3FileSystem { - let config = AppConfig::from_file(Some("tests/conf/gvfs_fuse_s3.toml")).unwrap(); - let opendal_config = extract_s3_config(&config); - - let fs_context = FileSystemContext::default(); - - let builder = S3::from_map(opendal_config); + pub(crate) async fn cleanup_s3_fs( + cwd: &Path, + opendal_config: &HashMap, + ) -> Operator { + let builder = S3::from_map(opendal_config.clone()); let op = Operator::new(builder) .expect("opendal create failed") .layer(LoggingLayer::default()) @@ -241,18 +234,37 @@ mod tests { op.create_dir(&file_name) .await .expect("create test dir failed"); + op + } + + async fn create_s3_fs(cwd: &Path, config: &AppConfig) -> S3FileSystem { + let opendal_config = extract_s3_config(config); + let op = cleanup_s3_fs(cwd, &opendal_config).await; + + let fs_context = FileSystemContext::default(); + let open_dal_fs = OpenDalFileSystem::new(op, config, &fs_context); - let open_dal_fs = OpenDalFileSystem::new(op, &config, &fs_context); S3FileSystem { open_dal_fs } } - #[tokio::test] - async fn test_s3_file_system() { - if std::env::var("RUN_S3_TESTS").is_err() { - return; + pub(crate) fn s3_test_config() -> AppConfig { + let mut config_file_name = "target/conf/gvfs_fuse_s3.toml"; + let source_file_name = "tests/conf/gvfs_fuse_s3.toml"; + + if !Path::new(config_file_name).exists() { + config_file_name = source_file_name; } + + AppConfig::from_file(Some(config_file_name)).unwrap() + } + + #[tokio::test] + async fn s3_ut_test_s3_file_system() { + test_enable_with!(RUN_TEST_WITH_S3); + + let config = s3_test_config(); let cwd = Path::new("/gvfs_test1"); - let fs = create_s3_fs(cwd).await; + let fs = 
create_s3_fs(cwd, &config).await; let _ = fs.init().await; let mut tester = TestPathFileSystem::new(cwd, fs); @@ -260,13 +272,12 @@ mod tests { } #[tokio::test] - async fn test_s3_file_system_with_raw_file_system() { - if std::env::var("RUN_S3_TESTS").is_err() { - return; - } + async fn s3_ut_test_s3_file_system_with_raw_file_system() { + test_enable_with!(RUN_TEST_WITH_S3); + let config = s3_test_config(); let cwd = Path::new("/gvfs_test2"); - let s3_fs = create_s3_fs(cwd).await; + let s3_fs = create_s3_fs(cwd, &config).await; let raw_fs = DefaultRawFileSystem::new(s3_fs, &AppConfig::default(), &FileSystemContext::default()); let _ = raw_fs.init().await; diff --git a/clients/filesystem-fuse/tests/bin/env.sh b/clients/filesystem-fuse/tests/bin/env.sh new file mode 100644 index 00000000000..c2e0b23be05 --- /dev/null +++ b/clients/filesystem-fuse/tests/bin/env.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-test} +S3_SECRET_ACCESS=${S3_SECRET_ACCESS:-test} +S3_REGION=${S3_REGION:-ap-southeast-2} +S3_BUCKET=${S3_BUCKET:-my-bucket} +S3_ENDPOINT=${S3_ENDPOINT:-http://127.0.0.1:4566} + +# Check required environment variables +if [[ -z "$S3_ACCESS_KEY_ID" || -z "$S3_SECRET_ACCESS" || -z "$S3_REGION" || -z "$S3_BUCKET" || -z "$S3_ENDPOINT" ]]; then + echo "Error: One or more required S3 environment variables are not set." + echo "Please set: S3_ACCESS_KEY_ID, S3_SECRET_ACCESS, S3_REGION, S3_BUCKET, S3_ENDPOINT." + exit 1 +fi + +DISABLE_LOCALSTACK=${DISABLE_LOCALSTACK:-0} +# if S3 endpoint is not default value. disable localstack +if [[ "$S3_ENDPOINT" != "http://127.0.0.1:4566" ]]; then + echo "AWS S3 endpoint detected, disabling localstack" + DISABLE_LOCALSTACK=1 +fi + +GRAVITINO_HOME=../../../.. +GRAVITINO_HOME=$(cd $GRAVITINO_HOME && pwd) +GRAVITINO_SERVER_DIR=$GRAVITINO_HOME/distribution/package +CLIENT_FUSE_DIR=$GRAVITINO_HOME/clients/filesystem-fuse + +generate_test_config() { + local config_dir + config_dir=$(dirname "$TEST_CONFIG_FILE") + mkdir -p "$config_dir" + + awk -v access_key="$S3_ACCESS_KEY_ID" \ + -v secret_key="$S3_SECRET_ACCESS" \ + -v region="$S3_REGION" \ + -v bucket="$S3_BUCKET" \ + -v endpoint="$S3_ENDPOINT" \ + 'BEGIN { in_extend_config = 0 } + /^\[extend_config\]/ { in_extend_config = 1 } + in_extend_config && /s3-access_key_id/ { $0 = "s3-access_key_id = \"" access_key "\"" } + in_extend_config && /s3-secret_access_key/ { $0 = "s3-secret_access_key = \"" secret_key "\"" } + in_extend_config && /s3-region/ { $0 = "s3-region = \"" region "\"" } + in_extend_config && /s3-bucket/ { $0 = "s3-bucket = \"" bucket "\"" } + in_extend_config && /s3-endpoint/ { $0 = "s3-endpoint = \"" endpoint "\"" } + { print }' $CLIENT_FUSE_DIR/tests/conf/gvfs_fuse_s3.toml > "$TEST_CONFIG_FILE" +} diff --git 
a/clients/filesystem-fuse/tests/bin/gravitino_server.sh b/clients/filesystem-fuse/tests/bin/gravitino_server.sh new file mode 100644 index 00000000000..0f9b0fdab98 --- /dev/null +++ b/clients/filesystem-fuse/tests/bin/gravitino_server.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +GRAVITINO_SERVER_URL="http://localhost:8090" + +check_gravitino_server_ready() { + local url=$1 + local retries=10 # Number of retries + local wait_time=1 # Wait time between retries (seconds) + + for ((i=1; i<=retries; i++)); do + if curl --silent --head --fail "$url/api/metalakes" >/dev/null; then + echo "Gravitino server is ready." + return 0 + else + echo "Attempt $i/$retries: Server not ready. Retrying in $wait_time seconds..." + sleep "$wait_time" + fi + done + + echo "Error: Gravitino server did not become ready after $((retries * wait_time)) seconds." 
+ exit 1 +} + +create_resource() { + local url=$1 + local data=$2 + + response=$(curl -s -w "\n%{http_code}" -X POST -H "Accept: application/vnd.gravitino.v1+json" \ + -H "Content-Type: application/json" -d "$data" "$url") + + body=$(echo "$response" | head -n -1) + response_code=$(echo "$response" | tail -n 1) + + # Check if the response code is not 2xx + if [[ "$response_code" -lt 200 || "$response_code" -ge 300 ]]; then + echo "Error: Failed to create resource. Status code: $response_code" + echo "Response body: $body" + exit 1 + fi +} + + + +start_gravitino_server() { + echo "Starting Gravitino Server" + # copy the aws-bundle to the server + if ls $GRAVITINO_SERVER_DIR/catalogs/hadoop/libs/gravitino-aws-bundle-*-incubating-SNAPSHOT.jar 1>/dev/null 2>&1; then + echo "File exists, skipping copy." + else + echo "Copying the aws-bundle to the server" + cp $GRAVITINO_HOME/bundles/aws-bundle/build/libs/gravitino-aws-bundle-*-incubating-SNAPSHOT.jar \ + $GRAVITINO_SERVER_DIR/catalogs/hadoop/libs + fi + + rm -rf $GRAVITINO_SERVER_DIR/data + $GRAVITINO_SERVER_DIR/bin/gravitino.sh restart + + check_gravitino_server_ready $GRAVITINO_SERVER_URL + + # Create metalake + create_resource "$GRAVITINO_SERVER_URL/api/metalakes" '{ + "name":"test", + "comment":"comment", + "properties":{} + }' + + # Create catalog + create_resource "$GRAVITINO_SERVER_URL/api/metalakes/test/catalogs" '{ + "name": "c1", + "type": "FILESET", + "comment": "comment", + "provider": "hadoop", + "properties": { + "location": "s3a://'"$S3_BUCKET"'", + "s3-access-key-id": "'"$S3_ACCESS_KEY_ID"'", + "s3-secret-access-key": "'"$S3_SECRET_ACCESS"'", + "s3-endpoint": "'"$S3_ENDPOINT"'", + "filesystem-providers": "s3" + } + }' + + # Create schema + create_resource "$GRAVITINO_SERVER_URL/api/metalakes/test/catalogs/c1/schemas" '{ + "name":"s1", + "comment":"comment", + "properties":{} + }' + + # Create FILESET + create_resource "$GRAVITINO_SERVER_URL/api/metalakes/test/catalogs/c1/schemas/s1/filesets" '{ + 
"name":"fileset1", + "comment":"comment", + "properties":{} + }' +} + +stop_gravitino_server() { + $GRAVITINO_SERVER_DIR/bin/gravitino.sh stop + echo "Gravitino Server stopped" +} \ No newline at end of file diff --git a/clients/filesystem-fuse/tests/bin/gvfs_fuse.sh b/clients/filesystem-fuse/tests/bin/gvfs_fuse.sh new file mode 100644 index 00000000000..e706d8e2c0d --- /dev/null +++ b/clients/filesystem-fuse/tests/bin/gvfs_fuse.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +check_gvfs_fuse_ready() { + local retries=10 + local wait_time=1 + + for ((i=1; i<=retries; i++)); do + # check the $MOUNT_DIR/.gvfs_meta is exist + if [ -f "$MOUNT_DIR/.gvfs_meta" ]; then + echo "Gvfs fuse is ready." + return 0 + else + echo "Attempt $i/$retries: Gvfs fuse not ready. Retrying in $wait_time seconds..." + sleep "$wait_time" + fi + done + + echo "Error: Gvfs fuse did not become ready after $((retries * wait_time)) seconds." + tail -n 100 $CLIENT_FUSE_DIR/target/debug/fuse.log + exit 1 +} + +start_gvfs_fuse() { + MOUNT_DIR=$CLIENT_FUSE_DIR/target/gvfs + + umount $MOUNT_DIR > /dev/null 2>&1 || true + if [ ! 
-d "$MOUNT_DIR" ]; then + echo "Create the mount point" + mkdir -p $MOUNT_DIR + fi + + MOUNT_FROM_LOCATION=gvfs://fileset/test/c1/s1/fileset1 + + # Build the gvfs-fuse + cd $CLIENT_FUSE_DIR + make build + + echo "Starting gvfs-fuse-daemon" + $CLIENT_FUSE_DIR/target/debug/gvfs-fuse $MOUNT_DIR $MOUNT_FROM_LOCATION $TEST_CONFIG_FILE > \ + $CLIENT_FUSE_DIR/target/debug/fuse.log 2>&1 & + check_gvfs_fuse_ready + cd - +} + +stop_gvfs_fuse() { + # Stop the gvfs-fuse process if it's running + pkill -INT gvfs-fuse || true + echo "Stopping gvfs-fuse-daemon" +} \ No newline at end of file diff --git a/clients/filesystem-fuse/tests/bin/localstatck.sh b/clients/filesystem-fuse/tests/bin/localstatck.sh new file mode 100644 index 00000000000..fa4552d48a3 --- /dev/null +++ b/clients/filesystem-fuse/tests/bin/localstatck.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +start_localstack() { +if [ "$DISABLE_LOCALSTACK" -eq 1 ]; then + return +fi + + echo "Starting localstack..." 
+ docker run -d -p 4566:4566 -p 4571:4571 --name localstack localstack/localstack + echo "Localstack started" + + docker exec localstack sh -c "\ + aws configure set aws_access_key_id $S3_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $S3_SECRET_ACCESS && \ + aws configure set region $S3_REGION && \ + aws configure set output json" + + docker exec localstack awslocal s3 mb s3://$S3_BUCKET +} + +stop_localstack() { +if [ "$DISABLE_LOCALSTACK" -eq 1 ]; then + return +fi + + echo "Stopping localstack..." + docker stop localstack 2>/dev/null || true + docker rm localstack 2>/dev/null || true + echo "Localstack stopped" +} \ No newline at end of file diff --git a/clients/filesystem-fuse/tests/bin/run_fuse_testers.sh b/clients/filesystem-fuse/tests/bin/run_fuse_testers.sh new file mode 100755 index 00000000000..6dc38c48f07 --- /dev/null +++ b/clients/filesystem-fuse/tests/bin/run_fuse_testers.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +source ./env.sh +source ./gravitino_server.sh +source ./gvfs_fuse.sh +source ./localstatck.sh + +TEST_CONFIG_FILE=$CLIENT_FUSE_DIR/target/debug/gvfs-fuse.toml + +start_servers() { + start_localstack + start_gravitino_server + generate_test_config + start_gvfs_fuse +} + +stop_servers() { + set +e + stop_gvfs_fuse + stop_gravitino_server + stop_localstack +} + +# Main logic based on parameters +if [ "$1" == "test" ]; then + trap stop_servers EXIT + start_servers + # Run the integration test + echo "Running tests..." + cd $CLIENT_FUSE_DIR + export RUN_TEST_WITH_FUSE=1 + cargo test --test fuse_test fuse_it_ + +elif [ "$1" == "start" ]; then + # Start the servers + echo "Starting servers..." + start_servers + +elif [ "$1" == "stop" ]; then + # Stop the servers + echo "Stopping servers..." + stop_servers + +else + echo "Usage: $0 {test|start|stop}" + exit 1 +fi + + diff --git a/clients/filesystem-fuse/tests/bin/run_s3fs_testers.sh b/clients/filesystem-fuse/tests/bin/run_s3fs_testers.sh new file mode 100644 index 00000000000..ac5f9812c93 --- /dev/null +++ b/clients/filesystem-fuse/tests/bin/run_s3fs_testers.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +source ./env.sh +source ./localstatck.sh + +TEST_CONFIG_FILE=$CLIENT_FUSE_DIR/target/conf/gvfs_fuse_s3.toml + +start_servers() { + start_localstack + generate_test_config +} + +stop_servers() { + set +e + stop_localstack +} + +# Main logic based on parameters +if [ "$1" == "test" ]; then + trap stop_servers EXIT + start_servers + # Run the integration test + echo "Running tests..." + cd $CLIENT_FUSE_DIR + export RUN_TEST_WITH_S3=1 + cargo test s3_ut_ --lib + +elif [ "$1" == "start" ]; then + # Start the servers + echo "Starting servers..." + start_servers + +elif [ "$1" == "stop" ]; then + # Stop the servers + echo "Stopping servers..." + stop_servers + +else + echo "Usage: $0 {test|start|stop}" + exit 1 +fi + + diff --git a/clients/filesystem-fuse/tests/conf/gvfs_fuse_s3.toml b/clients/filesystem-fuse/tests/conf/gvfs_fuse_s3.toml index 7d182cd40df..d0ff8e5ddec 100644 --- a/clients/filesystem-fuse/tests/conf/gvfs_fuse_s3.toml +++ b/clients/filesystem-fuse/tests/conf/gvfs_fuse_s3.toml @@ -19,7 +19,7 @@ [fuse] file_mask= 0o600 dir_mask= 0o700 -fs_type = "memory" +fs_type = "gvfs" [fuse.properties] key1 = "value1" @@ -40,4 +40,5 @@ s3-access_key_id = "XXX_access_key" s3-secret_access_key = "XXX_secret_key" s3-region = "XXX_region" s3-bucket = "XXX_bucket" +s3-endpoint = "XXX_endpoint" diff --git a/clients/filesystem-fuse/tests/fuse_test.rs b/clients/filesystem-fuse/tests/fuse_test.rs index d06199d782e..41e385c49f1 100644 --- a/clients/filesystem-fuse/tests/fuse_test.rs +++ b/clients/filesystem-fuse/tests/fuse_test.rs @@ -19,7 +19,8 @@ use fuse3::Errno; use gvfs_fuse::config::AppConfig; -use gvfs_fuse::{gvfs_mount, gvfs_unmount}; +use gvfs_fuse::RUN_TEST_WITH_FUSE; +use gvfs_fuse::{gvfs_mount, gvfs_unmount, test_enable_with}; use log::{error, info}; use 
std::fs::File; use std::path::Path; @@ -85,7 +86,7 @@ impl Drop for FuseTest { } #[test] -fn test_fuse_system_with_auto() { +fn test_fuse_with_memory_fs() { tracing_subscriber::fmt().init(); panic::set_hook(Box::new(|info| { @@ -106,14 +107,21 @@ fn test_fuse_system_with_auto() { test_fuse_filesystem(mount_point); } -fn test_fuse_system_with_manual() { - test_fuse_filesystem("build/gvfs"); +#[test] +fn fuse_it_test_fuse() { + test_enable_with!(RUN_TEST_WITH_FUSE); + + test_fuse_filesystem("target/gvfs/gvfs_test"); } fn test_fuse_filesystem(mount_point: &str) { info!("Test startup"); let base_path = Path::new(mount_point); + if !file_exists(base_path) { + fs::create_dir_all(base_path).expect("Failed to create test dir"); + } + //test create file let test_file = base_path.join("test_create"); let file = File::create(&test_file).expect("Failed to create file"); @@ -124,12 +132,12 @@ fn test_fuse_filesystem(mount_point: &str) { fs::write(&test_file, "read test").expect("Failed to write file"); //test read file - let content = fs::read_to_string(test_file.clone()).expect("Failed to read file"); + let content = fs::read_to_string(&test_file).expect("Failed to read file"); assert_eq!(content, "read test", "File content mismatch"); //test delete file - fs::remove_file(test_file.clone()).expect("Failed to delete file"); - assert!(!file_exists(test_file)); + fs::remove_file(&test_file).expect("Failed to delete file"); + assert!(!file_exists(&test_file)); //test create directory let test_dir = base_path.join("test_dir"); From 5caa9de4f54f7c2c92156c6a427082eeb28ad49b Mon Sep 17 00:00:00 2001 From: Qi Yu Date: Tue, 14 Jan 2025 18:45:56 +0800 Subject: [PATCH 20/40] [#5472] improvement(docs): Add example to use cloud storage fileset and polish hadoop-catalog document. (#6059) ### What changes were proposed in this pull request? 1. Add full example about how to use cloud storage fileset like S3, GCS, OSS and ADLS 2. Polish how-to-use-gvfs.md and hadoop-catalog-md. 3. 
Add document how fileset using credential. ### Why are the changes needed? For better user experience. Fix: #5472 ### Does this PR introduce _any_ user-facing change? N/A. ### How was this patch tested? N/A --- .../gravitino/filesystem/gvfs_config.py | 4 +- docs/hadoop-catalog-index.md | 26 + docs/hadoop-catalog-with-adls.md | 522 +++++++++++++++++ docs/hadoop-catalog-with-gcs.md | 500 ++++++++++++++++ docs/hadoop-catalog-with-oss.md | 538 +++++++++++++++++ docs/hadoop-catalog-with-s3.md | 541 ++++++++++++++++++ docs/hadoop-catalog.md | 87 +-- docs/how-to-use-gvfs.md | 173 +----- ...manage-fileset-metadata-using-gravitino.md | 59 +- 9 files changed, 2157 insertions(+), 293 deletions(-) create mode 100644 docs/hadoop-catalog-index.md create mode 100644 docs/hadoop-catalog-with-adls.md create mode 100644 docs/hadoop-catalog-with-gcs.md create mode 100644 docs/hadoop-catalog-with-oss.md create mode 100644 docs/hadoop-catalog-with-s3.md diff --git a/clients/client-python/gravitino/filesystem/gvfs_config.py b/clients/client-python/gravitino/filesystem/gvfs_config.py index 6fbd8a99d18..34db72adee0 100644 --- a/clients/client-python/gravitino/filesystem/gvfs_config.py +++ b/clients/client-python/gravitino/filesystem/gvfs_config.py @@ -42,8 +42,8 @@ class GVFSConfig: GVFS_FILESYSTEM_OSS_SECRET_KEY = "oss_secret_access_key" GVFS_FILESYSTEM_OSS_ENDPOINT = "oss_endpoint" - GVFS_FILESYSTEM_AZURE_ACCOUNT_NAME = "abs_account_name" - GVFS_FILESYSTEM_AZURE_ACCOUNT_KEY = "abs_account_key" + GVFS_FILESYSTEM_AZURE_ACCOUNT_NAME = "azure_storage_account_name" + GVFS_FILESYSTEM_AZURE_ACCOUNT_KEY = "azure_storage_account_key" # This configuration marks the expired time of the credential. 
For instance, if the credential # fetched from Gravitino server has expired time of 3600 seconds, and the credential_expired_time_ration is 0.5 diff --git a/docs/hadoop-catalog-index.md b/docs/hadoop-catalog-index.md new file mode 100644 index 00000000000..dfa7a187175 --- /dev/null +++ b/docs/hadoop-catalog-index.md @@ -0,0 +1,26 @@ +--- +title: "Hadoop catalog index" +slug: /hadoop-catalog-index +date: 2025-01-13 +keyword: Hadoop catalog index S3 GCS ADLS OSS +license: "This software is licensed under the Apache License version 2." +--- + +### Hadoop catalog overall + +Gravitino Hadoop catalog index includes the following chapters: + +- [Hadoop catalog overview and features](./hadoop-catalog.md): This chapter provides an overview of the Hadoop catalog, its features, capabilities and related configurations. +- [Manage Hadoop catalog with Gravitino API](./manage-fileset-metadata-using-gravitino.md): This chapter explains how to manage fileset metadata using Gravitino API and provides detailed examples. +- [Using Hadoop catalog with Gravitino virtual file system](how-to-use-gvfs.md): This chapter explains how to use Hadoop catalog with the Gravitino virtual file system and provides detailed examples. + +### Hadoop catalog with cloud storage + +Apart from the above, you can also refer to the following topics to manage and access cloud storage like S3, GCS, ADLS, and OSS: + +- [Using Hadoop catalog to manage S3](./hadoop-catalog-with-s3.md). +- [Using Hadoop catalog to manage GCS](./hadoop-catalog-with-gcs.md). +- [Using Hadoop catalog to manage ADLS](./hadoop-catalog-with-adls.md). +- [Using Hadoop catalog to manage OSS](./hadoop-catalog-with-oss.md). + +More storage options will be added soon. Stay tuned! 
\ No newline at end of file diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md new file mode 100644 index 00000000000..96126c6fab9 --- /dev/null +++ b/docs/hadoop-catalog-with-adls.md @@ -0,0 +1,522 @@ +--- +title: "Hadoop catalog with ADLS" +slug: /hadoop-catalog-with-adls +date: 2025-01-03 +keyword: Hadoop catalog ADLS +license: "This software is licensed under the Apache License version 2." +--- + +This document describes how to configure a Hadoop catalog with ADLS (also known as Azure Blob Storage (ABS) or Azure Data Lake Storage (v2)). + +## Prerequisites + +To set up a Hadoop catalog with ADLS, follow these steps: + +1. Download the [`gravitino-azure-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure-bundle) file. +2. Place the downloaded file into the Gravitino Hadoop catalog classpath at `${GRAVITINO_HOME}/catalogs/hadoop/libs/`. +3. Start the Gravitino server by running the following command: + +```bash +$ ${GRAVITINO_HOME}/bin/gravitino-server.sh start +``` + +Once the server is up and running, you can proceed to configure the Hadoop catalog with ADLS. In the rest of this document we will use `http://localhost:8090` as the Gravitino server URL; please replace it with your actual server URL. + +## Configurations for creating a Hadoop catalog with ADLS + +### Configuration for a ADLS Hadoop catalog + +Apart from configurations mentioned in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with ADLS: + +| Configuration item | Description | Default value | Required | Since version | +|-------------------------------|-------------|-----------------|----------|------------------| +| `filesystem-providers` | The file system providers to add.
Set it to `abs` if it's an Azure Blob Storage fileset, or a comma-separated string that contains `abs` like `oss,abs,s3` to support multiple kinds of fileset including `abs`. | (none) | Yes | 0.8.0-incubating | +| `default-filesystem-provider` | The name of the default filesystem provider of this Hadoop catalog, used if users do not specify the scheme in the URI. The default value is `builtin-local`. For Azure Blob Storage, if we set this value, we can omit the prefix 'abfss://' in the location. | `builtin-local` | No | 0.8.0-incubating | +| `azure-storage-account-name` | The account name of Azure Blob Storage. | (none) | Yes | 0.8.0-incubating | +| `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes | 0.8.0-incubating | +| `credential-providers` | The credential provider types, separated by commas; possible values are `adls-token` and `azure-account-key`. As the default authentication type uses the account name and account key as above, this configuration can enable credential vending provided by the Gravitino server, and the client will no longer need to provide authentication information like account_name/account_key to access ADLS by GVFS. Once it's set, more configuration items are needed to make it work; please see [adls-credential-vending](security/credential-vending.md#adls-credentials) | (none) | No | 0.8.0-incubating | + + +### Configurations for a schema + +Refer to [Schema configurations](./hadoop-catalog.md#schema-properties) for more details. + +### Configurations for a fileset + +Refer to [Fileset configurations](./hadoop-catalog.md#fileset-properties) for more details. + +## Example of creating Hadoop catalog with ADLS + +This section demonstrates how to create the Hadoop catalog with ADLS in Gravitino, with a complete example. + +### Step1: Create a Hadoop catalog with ADLS + +First, you need to create a Hadoop catalog with ADLS.
The following example shows how to create a Hadoop catalog with ADLS: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_catalog", + "type": "FILESET", + "comment": "This is a ADLS fileset catalog", + "provider": "hadoop", + "properties": { + "location": "abfss://container@account-name.dfs.core.windows.net/path", + "azure-storage-account-name": "The account name of the Azure Blob Storage", + "azure-storage-account-key": "The account key of the Azure Blob Storage", + "filesystem-providers": "abs" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Map adlsProperties = ImmutableMap.builder() + .put("location", "abfss://container@account-name.dfs.core.windows.net/path") + .put("azure-storage-account-name", "azure storage account name") + .put("azure-storage-account-key", "azure storage account key") + .put("filesystem-providers", "abs") + .build(); + +Catalog adlsCatalog = gravitinoClient.createCatalog("example_catalog", + Type.FILESET, + "hadoop", // provider, Gravitino only supports "hadoop" for now. + "This is a ADLS fileset catalog", + adlsProperties); +// ... 
+ +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +adls_properties = { + "location": "abfss://container@account-name.dfs.core.windows.net/path", + "azure-storage-account-name": "azure storage account name", + "azure-storage-account-key": "azure storage account key", + "filesystem-providers": "abs" +} + +adls_properties = gravitino_client.create_catalog(name="example_catalog", + type=Catalog.Type.FILESET, + provider="hadoop", + comment="This is a ADLS fileset catalog", + properties=adls_properties) +``` + + + + +### Step2: Create a schema + +Once the catalog is created, you can create a schema. The following example shows how to create a schema: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "test_schema", + "comment": "This is a ADLS schema", + "properties": { + "location": "abfss://container@account-name.dfs.core.windows.net/path" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas +``` + + + + +```java +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); + +SupportsSchemas supportsSchemas = catalog.asSchemas(); + +Map schemaProperties = ImmutableMap.builder() + .put("location", "abfss://container@account-name.dfs.core.windows.net/path") + .build(); +Schema schema = supportsSchemas.createSchema("test_schema", + "This is a ADLS schema", + schemaProperties +); +// ... +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_schemas().create_schema(name="test_schema", + comment="This is a ADLS schema", + properties={"location": "abfss://container@account-name.dfs.core.windows.net/path"}) +``` + + + + +### Step3: Create a fileset + +After creating the schema, you can create a fileset. 
The following example shows how to create a fileset: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_fileset", + "comment": "This is an example fileset", + "type": "MANAGED", + "storageLocation": "abfss://container@account-name.dfs.core.windows.net/path/example_fileset", + "properties": { + "k1": "v1" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); +FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); + +Map propertiesMap = ImmutableMap.builder() + .put("k1", "v1") + .build(); + +filesetCatalog.createFileset( + NameIdentifier.of("test_schema", "example_fileset"), + "This is an example fileset", + Fileset.Type.MANAGED, + "abfss://container@account-name.dfs.core.windows.net/path/example_fileset", + propertiesMap, +); +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") + +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), + type=Fileset.Type.MANAGED, + comment="This is an example fileset", + storage_location="abfss://container@account-name.dfs.core.windows.net/path/example_fileset", + properties={"k1": "v1"}) +``` + + + + +## Accessing a fileset with ADLS + +### Using the GVFS Java client to access the fileset + +To access fileset with Azure Blob Storage(ADLS) using the GVFS Java client, based on the [basic GVFS configurations](./how-to-use-gvfs.md#configuration-1), you need to add the following configurations: + +| Configuration item | Description | Default value | Required | Since version | 
+|------------------------------|-----------------------------------------|---------------|----------|------------------| +| `azure-storage-account-name` | The account name of Azure Blob Storage. | (none) | Yes | 0.8.0-incubating | +| `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes | 0.8.0-incubating | + +:::note +If the catalog has enabled [credential vending](security/credential-vending.md), the properties above can be omitted. More details can be found in [Fileset with credential vending](#fileset-with-credential-vending). +::: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri", "http://localhost:8090"); +conf.set("fs.gravitino.client.metalake", "test_metalake"); +conf.set("azure-storage-account-name", "account_name_of_adls"); +conf.set("azure-storage-account-key", "account_key_of_adls"); +Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + +Similar to Spark configurations, you need to add ADLS (bundle) jars to the classpath according to your environment. 
+ +If you want to customize your Hadoop version or there is already a Hadoop version in your project, you can add the following dependencies to your `pom.xml`: + +```xml +<dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>${HADOOP_VERSION}</version> +</dependency> + +<dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-azure</artifactId> + <version>${HADOOP_VERSION}</version> +</dependency> + +<dependency> + <groupId>org.apache.gravitino</groupId> + <artifactId>filesystem-hadoop3-runtime</artifactId> + <version>${GRAVITINO_VERSION}</version> +</dependency> + +<dependency> + <groupId>org.apache.gravitino</groupId> + <artifactId>gravitino-azure</artifactId> + <version>${GRAVITINO_VERSION}</version> +</dependency> +``` + +Or, if there is no Hadoop environment in your project, use the bundle jar, which includes the Hadoop environment: + +```xml +<dependency> + <groupId>org.apache.gravitino</groupId> + <artifactId>gravitino-azure-bundle</artifactId> + <version>${GRAVITINO_VERSION}</version> +</dependency> + +<dependency> + <groupId>org.apache.gravitino</groupId> + <artifactId>filesystem-hadoop3-runtime</artifactId> + <version>${GRAVITINO_VERSION}</version> +</dependency> +``` + +### Using Spark to access the fileset + +The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment (Hadoop 3.2.0)** to access the fileset: + +Before running the following code, you need to install required packages: + +```bash +pip install pyspark==3.1.3 +pip install apache-gravitino==${GRAVITINO_VERSION} +``` +Then you can run the following code: + +```python +from pyspark.sql import SparkSession +import os + +gravitino_url = "http://localhost:8090" +metalake_name = "test" + +catalog_name = "your_adls_catalog" +schema_name = "your_adls_schema" +fileset_name = "your_adls_fileset" + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/hadoop-azure-3.2.0.jar,/path/to/azure-storage-7.0.0.jar,/path/to/wildfly-openssl-1.0.4.Final.jar --master local[1] pyspark-shell" +spark = SparkSession.builder + .appName("adls_fileset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + 
.config("spark.hadoop.fs.gravitino.client.metalake", "test") + .config("spark.hadoop.azure-storage-account-name", "azure_account_name") + .config("spark.hadoop.azure-storage-account-key", "azure_account_key") + .config("spark.hadoop.fs.azure.skipUserGroupMetadataDuringInitialization", "true") + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" + +spark_df.coalesce(1).write + .mode("overwrite") + .option("header", "true") + .csv(gvfs_path) +``` + +If your Spark is **without Hadoop environment**, you can use the following code snippet to access the fileset: + +```python +## Replace the PYSPARK_SUBMIT_ARGS line in the code snippet above with the following one; everything else stays the same + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" +``` + +- [`gravitino-azure-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure-bundle) is the Gravitino ADLS jar with Hadoop environment (3.3.1) and `hadoop-azure` jar. +- [`gravitino-azure-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure) is a condensed version of the Gravitino ADLS bundle jar without Hadoop environment and `hadoop-azure` jar. +- `hadoop-azure-3.2.0.jar` and `azure-storage-7.0.0.jar` can be found in the Hadoop distribution in the `${HADOOP_HOME}/share/hadoop/tools/lib` directory. + + +Please choose the correct jar according to your environment. + +:::note +In some Spark versions, a Hadoop environment is necessary for the driver, so adding the bundle jars with '--jars' may not work. 
If this is the case, you should add the jars to the Spark CLASSPATH directly. +::: + +### Accessing a fileset using the Hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. Add the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: + +```xml +<property> + <name>fs.AbstractFileSystem.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.Gvfs</value> +</property> + +<property> + <name>fs.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem</value> +</property> + +<property> + <name>fs.gravitino.server.uri</name> + <value>http://localhost:8090</value> +</property> + +<property> + <name>fs.gravitino.client.metalake</name> + <value>test</value> +</property> + +<property> + <name>azure-storage-account-name</name> + <value>account_name</value> +</property> +<property> + <name>azure-storage-account-key</name> + <value>account_key</value> +</property> +``` + +2. Add the necessary jars to the Hadoop classpath. + +For ADLS, you need to add `gravitino-filesystem-hadoop3-runtime-${gravitino-version}.jar`, `gravitino-azure-${gravitino-version}.jar` and `hadoop-azure-${hadoop-version}.jar` located at `${HADOOP_HOME}/share/hadoop/tools/lib/` to the Hadoop classpath. + +3. 
Run the following command to access the fileset: + +```shell +./${HADOOP_HOME}/bin/hadoop dfs -ls gvfs://fileset/adls_catalog/adls_schema/adls_fileset +./${HADOOP_HOME}/bin/hadoop dfs -put /path/to/local/file gvfs://fileset/adls_catalog/adls_schema/adls_fileset +``` + +### Using the GVFS Python client to access a fileset + +In order to access fileset with Azure Blob storage (ADLS) using the GVFS Python client, apart from [basic GVFS configurations](./how-to-use-gvfs.md#configuration-1), you need to add the following configurations: + +| Configuration item | Description | Default value | Required | Since version | +|------------------------------|----------------------------------------|---------------|----------|------------------| +| `azure_storage_account_name` | The account name of Azure Blob Storage | (none) | Yes | 0.8.0-incubating | +| `azure_storage_account_key` | The account key of Azure Blob Storage | (none) | Yes | 0.8.0-incubating | + +:::note +If the catalog has enabled [credential vending](security/credential-vending.md), the properties above can be omitted. 
+::: + +Please install the `gravitino` package before running the following code: + +```bash +pip install apache-gravitino==${GRAVITINO_VERSION} +``` + +```python +from gravitino import gvfs +options = { + "cache_size": 20, + "cache_expired_time": 3600, + "auth_type": "simple", + "azure_storage_account_name": "azure_account_name", + "azure_storage_account_key": "azure_account_key" +} +fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) +fs.ls("gvfs://fileset/{adls_catalog}/{adls_schema}/{adls_fileset}/") +``` + + +### Using fileset with pandas + +The following are examples of how to use the pandas library to access the ADLS fileset + +```python +import pandas as pd + +storage_options = { + "server_uri": "http://localhost:8090", + "metalake_name": "test", + "options": { + "azure_storage_account_name": "azure_account_name", + "azure_storage_account_key": "azure_account_key" + } +} +ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", + storage_options=storage_options) +ds.head() +``` + +For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. + +## Fileset with credential vending + +Since 0.8.0-incubating, Gravitino supports credential vending for ADLS fileset. If the catalog has been [configured with credential](./security/credential-vending.md), you can access ADLS fileset without providing authentication information like `azure-storage-account-name` and `azure-storage-account-key` in the properties. + +### How to create an ADLS Hadoop catalog with credential enabled + +Apart from configuration method in [create-adls-hadoop-catalog](#configuration-for-a-adls-hadoop-catalog), properties needed by [adls-credential](./security/credential-vending.md#adls-credentials) should also be set to enable credential vending for ADLS fileset. 
+ +### How to access ADLS fileset with credential + +If the catalog has been configured with credential, you can access ADLS fileset without providing authentication information via GVFS Java/Python client and Spark. Let's see how to access ADLS fileset with credential: + +GVFS Java client: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri", "http://localhost:8090"); +conf.set("fs.gravitino.client.metalake", "test_metalake"); +// No need to set azure-storage-account-name and azure-storage-account-key +Path filesetPath = new Path("gvfs://fileset/adls_test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + +Spark: + +```python +spark = SparkSession.builder + .appName("adls_fileset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.client.metalake", "test") + # No need to set azure-storage-account-name and azure-storage-account-key + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() +``` + +Python client and Hadoop command are similar to the above examples. + diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md new file mode 100644 index 00000000000..a3eb034b4fe --- /dev/null +++ b/docs/hadoop-catalog-with-gcs.md @@ -0,0 +1,500 @@ +--- +title: "Hadoop catalog with GCS" +slug: /hadoop-catalog-with-gcs +date: 2024-01-03 +keyword: Hadoop catalog GCS +license: "This software is licensed under the Apache License version 2." 
+--- + +This document describes how to configure a Hadoop catalog with GCS. + +## Prerequisites +To set up a Hadoop catalog with GCS, follow these steps: + +1. Download the [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp-bundle) file. +2. Place the downloaded file into the Gravitino Hadoop catalog classpath at `${GRAVITINO_HOME}/catalogs/hadoop/libs/`. +3. Start the Gravitino server by running the following command: + +```bash +$ ${GRAVITINO_HOME}/bin/gravitino-server.sh start +``` + +Once the server is up and running, you can proceed to configure the Hadoop catalog with GCS. In the rest of this document we will use `http://localhost:8090` as the Gravitino server URL; please replace it with your actual server URL. + +## Configurations for creating a Hadoop catalog with GCS + +### Configurations for a GCS Hadoop catalog + +Apart from configurations mentioned in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with GCS: + +| Configuration item | Description | Default value | Required | Since version | +|-------------------------------|-------------|-----------------|----------|------------------| +| `filesystem-providers` | The file system providers to add. Set it to `gcs` if it's a GCS fileset, or a comma separated string that contains `gcs` like `gcs,s3` to support multiple kinds of fileset including `gcs`. | (none) | Yes | 0.7.0-incubating | +| `default-filesystem-provider` | The name of the default filesystem provider of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`; for GCS, if we set this value, we can omit the prefix 'gs://' in the location. | `builtin-local` | No | 0.7.0-incubating | +| `gcs-service-account-file` | The path of GCS service account JSON file. 
| (none) | Yes | 0.7.0-incubating | +| `credential-providers` | The credential provider types, separated by comma, possible value can be `gcs-token`. As the default authentication type is using service account as the above, this configuration can enable credential vending provided by Gravitino server and client will no longer need to provide authentication information like service account to access GCS by GVFS. Once it's set, more configuration items are needed to make it works, please see [gcs-credential-vending](security/credential-vending.md#gcs-credentials) | (none) | No | 0.8.0-incubating | + + +### Configurations for a schema + +Refer to [Schema configurations](./hadoop-catalog.md#schema-properties) for more details. + +### Configurations for a fileset + +Refer to [Fileset configurations](./hadoop-catalog.md#fileset-properties) for more details. + +## Example of creating Hadoop catalog with GCS + +This section will show you how to use the Hadoop catalog with GCS in Gravitino, including detailed examples. + +### Create a Hadoop catalog with GCS + +First, you need to create a Hadoop catalog with GCS. 
The following example shows how to create a Hadoop catalog with GCS: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "test_catalog", + "type": "FILESET", + "comment": "This is a GCS fileset catalog", + "provider": "hadoop", + "properties": { + "location": "gs://bucket/root", + "gcs-service-account-file": "path_of_gcs_service_account_file", + "filesystem-providers": "gcs" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Map gcsProperties = ImmutableMap.builder() + .put("location", "gs://bucket/root") + .put("gcs-service-account-file", "path_of_gcs_service_account_file") + .put("filesystem-providers", "gcs") + .build(); + +Catalog gcsCatalog = gravitinoClient.createCatalog("test_catalog", + Type.FILESET, + "hadoop", // provider, Gravitino only supports "hadoop" for now. + "This is a GCS fileset catalog", + gcsProperties); +// ... + +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +gcs_properties = { + "location": "gs://bucket/root", + "gcs-service-account-file": "path_of_gcs_service_account_file", + "filesystem-providers": "gcs" +} + +gcs_properties = gravitino_client.create_catalog(name="test_catalog", + type=Catalog.Type.FILESET, + provider="hadoop", + comment="This is a GCS fileset catalog", + properties=gcs_properties) +``` + + + + +### Step2: Create a schema + +Once you have created a Hadoop catalog with GCS, you can create a schema. 
The following example shows how to create a schema: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "test_schema", + "comment": "This is a GCS schema", + "properties": { + "location": "gs://bucket/root/schema" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas +``` + + + + +```java +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); + +SupportsSchemas supportsSchemas = catalog.asSchemas(); + +Map schemaProperties = ImmutableMap.builder() + .put("location", "gs://bucket/root/schema") + .build(); +Schema schema = supportsSchemas.createSchema("test_schema", + "This is a GCS schema", + schemaProperties +); +// ... +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_schemas().create_schema(name="test_schema", + comment="This is a GCS schema", + properties={"location": "gs://bucket/root/schema"}) +``` + + + + + +### Step3: Create a fileset + +After creating a schema, you can create a fileset. 
The following example shows how to create a fileset: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_fileset", + "comment": "This is an example fileset", + "type": "MANAGED", + "storageLocation": "gs://bucket/root/schema/example_fileset", + "properties": { + "k1": "v1" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); +FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); + +Map propertiesMap = ImmutableMap.builder() + .put("k1", "v1") + .build(); + +filesetCatalog.createFileset( + NameIdentifier.of("test_schema", "example_fileset"), + "This is an example fileset", + Fileset.Type.MANAGED, + "gs://bucket/root/schema/example_fileset", + propertiesMap, +); +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") + +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), + type=Fileset.Type.MANAGED, + comment="This is an example fileset", + storage_location="gs://bucket/root/schema/example_fileset", + properties={"k1": "v1"}) +``` + + + + +## Accessing a fileset with GCS + +### Using the GVFS Java client to access the fileset + +To access fileset with GCS using the GVFS Java client, based on the [basic GVFS configurations](./how-to-use-gvfs.md#configuration-1), you need to add the following configurations: + +| Configuration item | Description | Default value | Required | Since version | +|----------------------------|--------------------------------------------|---------------|----------|------------------| +| 
`gcs-service-account-file` | The path of GCS service account JSON file. | (none) | Yes | 0.7.0-incubating | + +:::note +If the catalog has enabled [credential vending](security/credential-vending.md), the properties above can be omitted. More details can be found in [Fileset with credential vending](#fileset-with-credential-vending). +::: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri", "http://localhost:8090"); +conf.set("fs.gravitino.client.metalake", "test_metalake"); +conf.set("gcs-service-account-file", "/path/your-service-account-file.json"); +Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + +Similar to Spark configurations, you need to add GCS (bundle) jars to the classpath according to your environment. 
+If you want to customize your Hadoop version or there is already a Hadoop version in your project, you can add the following dependencies to your `pom.xml`: + +```xml +<dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>${HADOOP_VERSION}</version> +</dependency> +<dependency> + <groupId>com.google.cloud.bigdataoss</groupId> + <artifactId>gcs-connector</artifactId> + <version>${GCS_CONNECTOR_VERSION}</version> +</dependency> +<dependency> + <groupId>org.apache.gravitino</groupId> + <artifactId>filesystem-hadoop3-runtime</artifactId> + <version>${GRAVITINO_VERSION}</version> +</dependency> + +<dependency> + <groupId>org.apache.gravitino</groupId> + <artifactId>gravitino-gcp</artifactId> + <version>${GRAVITINO_VERSION}</version> +</dependency> +``` + +Or, if there is no Hadoop environment in your project, use the bundle jar, which includes the Hadoop environment: + +```xml +<dependency> + <groupId>org.apache.gravitino</groupId> + <artifactId>gravitino-gcp-bundle</artifactId> + <version>${GRAVITINO_VERSION}</version> +</dependency> + +<dependency> + <groupId>org.apache.gravitino</groupId> + <artifactId>filesystem-hadoop3-runtime</artifactId> + <version>${GRAVITINO_VERSION}</version> +</dependency> +``` + +### Using Spark to access the fileset + +The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment (Hadoop 3.2.0)** to access the fileset: + +Before running the following code, you need to install required packages: + +```bash +pip install pyspark==3.1.3 +pip install apache-gravitino==${GRAVITINO_VERSION} +``` +Then you can run the following code: + +```python +from pyspark.sql import SparkSession +import os + +gravitino_url = "http://localhost:8090" +metalake_name = "test" + +catalog_name = "your_gcs_catalog" +schema_name = "your_gcs_schema" +fileset_name = "your_gcs_fileset" + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/gcs-connector-hadoop3-2.2.22-shaded.jar --master local[1] pyspark-shell" +spark = SparkSession.builder + .appName("gcs_fileset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.client.metalake", "test_metalake") + 
.config("spark.hadoop.gcs-service-account-file", "/path/to/gcs-service-account-file.json") + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" + +spark_df.coalesce(1).write + .mode("overwrite") + .option("header", "true") + .csv(gvfs_path) +``` + +If your Spark is **without Hadoop environment**, you can use the following code snippet to access the fileset: + +```python +## Replace the PYSPARK_SUBMIT_ARGS line in the code snippet above with the following one; everything else stays the same + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" +``` + +- [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp-bundle) is the Gravitino GCP jar with Hadoop environment (3.3.1) and `gcs-connector`. +- [`gravitino-gcp-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp) is a condensed version of the Gravitino GCP bundle jar without Hadoop environment and [`gcs-connector`](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.22/gcs-connector-hadoop3-2.2.22-shaded.jar). + +Please choose the correct jar according to your environment. + +:::note +In some Spark versions, a Hadoop environment is needed by the driver, so adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the Spark CLASSPATH directly. +::: + +### Accessing a fileset using the Hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. 
Add the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: + +```xml +<property> + <name>fs.AbstractFileSystem.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.Gvfs</value> +</property> + +<property> + <name>fs.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem</value> +</property> + +<property> + <name>fs.gravitino.server.uri</name> + <value>http://localhost:8090</value> +</property> + +<property> + <name>fs.gravitino.client.metalake</name> + <value>test</value> +</property> + +<property> + <name>gcs-service-account-file</name> + <value>/path/your-service-account-file.json</value> +</property> +``` + +2. Add the necessary jars to the Hadoop classpath. + +For GCS, you need to add `gravitino-filesystem-hadoop3-runtime-${gravitino-version}.jar`, `gravitino-gcp-${gravitino-version}.jar` and [`gcs-connector-hadoop3-2.2.22-shaded.jar`](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.22/gcs-connector-hadoop3-2.2.22-shaded.jar) to the Hadoop classpath. + +3. Run the following command to access the fileset: + +```shell +./${HADOOP_HOME}/bin/hadoop dfs -ls gvfs://fileset/gcs_catalog/gcs_schema/gcs_example +./${HADOOP_HOME}/bin/hadoop dfs -put /path/to/local/file gvfs://fileset/gcs_catalog/gcs_schema/gcs_example +``` + +### Using the GVFS Python client to access a fileset + +In order to access fileset with GCS using the GVFS Python client, apart from [basic GVFS configurations](./how-to-use-gvfs.md#configuration-1), you need to add the following configurations: + +| Configuration item | Description | Default value | Required | Since version | +|----------------------------|-------------------------------------------|---------------|----------|------------------| +| `gcs_service_account_file` | The path of GCS service account JSON file.| (none) | Yes | 0.7.0-incubating | + +:::note +If the catalog has enabled [credential vending](security/credential-vending.md), the properties above can be omitted. 
+::: + +Please install the `gravitino` package before running the following code: + +```bash +pip install apache-gravitino==${GRAVITINO_VERSION} +``` + +```python +from gravitino import gvfs +options = { + "cache_size": 20, + "cache_expired_time": 3600, + "auth_type": "simple", + "gcs_service_account_file": "path_of_gcs_service_account_file.json", +} +fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) +fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") +``` + +### Using fileset with pandas + +The following are examples of how to use the pandas library to access the GCS fileset + +```python +import pandas as pd + +storage_options = { + "server_uri": "http://localhost:8090", + "metalake_name": "test", + "options": { + "gcs_service_account_file": "path_of_gcs_service_account_file.json", + } +} +ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", + storage_options=storage_options) +ds.head() +``` + +For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. + +## Fileset with credential vending + +Since 0.8.0-incubating, Gravitino supports credential vending for GCS fileset. If the catalog has been [configured with credential](./security/credential-vending.md), you can access GCS fileset without providing authentication information like `gcs-service-account-file` in the properties. + +### How to create a GCS Hadoop catalog with credential enabled + +Apart from configuration method in [create-gcs-hadoop-catalog](#configurations-for-a-gcs-hadoop-catalog), properties needed by [gcs-credential](./security/credential-vending.md#gcs-credentials) should also be set to enable credential vending for GCS fileset. 
+ +### How to access GCS fileset with credential + +If the catalog has been configured with credential, you can access GCS fileset without providing authentication information via GVFS Java/Python client and Spark. Let's see how to access GCS fileset with credential: + +GVFS Java client: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri", "http://localhost:8090"); +conf.set("fs.gravitino.client.metalake", "test_metalake"); +// No need to set gcs-service-account-file +Path filesetPath = new Path("gvfs://fileset/gcs_test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + +Spark: + +```python +spark = SparkSession.builder + .appName("gcs_fileset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.client.metalake", "test") + # No need to set gcs-service-account-file + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() +``` + +Python client and Hadoop command are similar to the above examples. diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md new file mode 100644 index 00000000000..e63935c720a --- /dev/null +++ b/docs/hadoop-catalog-with-oss.md @@ -0,0 +1,538 @@ +--- +title: "Hadoop catalog with OSS" +slug: /hadoop-catalog-with-oss +date: 2025-01-03 +keyword: Hadoop catalog OSS +license: "This software is licensed under the Apache License version 2." 
+--- + +This document explains how to configure a Hadoop catalog with Aliyun OSS (Object Storage Service) in Gravitino. + +## Prerequisites + +To set up a Hadoop catalog with OSS, follow these steps: + +1. Download the [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun-bundle) file. +2. Place the downloaded file into the Gravitino Hadoop catalog classpath at `${GRAVITINO_HOME}/catalogs/hadoop/libs/`. +3. Start the Gravitino server by running the following command: + +```bash +$ ${GRAVITINO_HOME}/bin/gravitino-server.sh start +``` + +Once the server is up and running, you can proceed to configure the Hadoop catalog with OSS. In the rest of this document we will use `http://localhost:8090` as the Gravitino server URL; please replace it with your actual server URL. + +## Configurations for creating a Hadoop catalog with OSS + +### Configuration for an OSS Hadoop catalog + +In addition to the basic configurations mentioned in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with OSS: + +| Configuration item | Description | Default value | Required | Since version | +|--------------------------------|-------------|-----------------|----------|------------------| +| `filesystem-providers` | The file system providers to add. Set it to `oss` if it's an OSS fileset, or a comma separated string that contains `oss` like `oss,gs,s3` to support multiple kinds of fileset including `oss`. | (none) | Yes | 0.7.0-incubating | +| `default-filesystem-provider` | The name of the default filesystem provider of this Hadoop catalog if users do not specify the scheme in the URI. The default value is `builtin-local`; for OSS, if we set this value, we can omit the prefix 'oss://' in the location. | `builtin-local` | No | 0.7.0-incubating | +| `oss-endpoint` | The endpoint of the Aliyun OSS. 
| (none) | Yes | 0.7.0-incubating | +| `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes | 0.7.0-incubating | +| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes | 0.7.0-incubating | +| `credential-providers` | The credential provider types, separated by commas. Possible values are `oss-token` and `oss-secret-key`. As the default authentication type is using AKSK as the above, this configuration can enable credential vending provided by Gravitino server and client will no longer need to provide authentication information like AKSK to access OSS by GVFS. Once it's set, more configuration items are needed to make it work; please see [oss-credential-vending](security/credential-vending.md#oss-credentials) | (none) | No | 0.8.0-incubating | + + +### Configurations for a schema + +To create a schema, refer to [Schema configurations](./hadoop-catalog.md#schema-properties). + +### Configurations for a fileset + +For instructions on how to create a fileset, refer to [Fileset configurations](./hadoop-catalog.md#fileset-properties). + +## Example of creating Hadoop catalog/schema/fileset with OSS + +This section will show you how to use the Hadoop catalog with OSS in Gravitino, including detailed examples. + +### Step1: Create a Hadoop catalog with OSS + +First, you need to create a Hadoop catalog for OSS. 
The following examples demonstrate how to create a Hadoop catalog with OSS: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "test_catalog", + "type": "FILESET", + "comment": "This is a OSS fileset catalog", + "provider": "hadoop", + "properties": { + "location": "oss://bucket/root", + "oss-access-key-id": "access_key", + "oss-secret-access-key": "secret_key", + "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com", + "filesystem-providers": "oss" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Map ossProperties = ImmutableMap.builder() + .put("location", "oss://bucket/root") + .put("oss-access-key-id", "access_key") + .put("oss-secret-access-key", "secret_key") + .put("oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com") + .put("filesystem-providers", "oss") + .build(); + +Catalog ossCatalog = gravitinoClient.createCatalog("test_catalog", + Type.FILESET, + "hadoop", // provider, Gravitino only supports "hadoop" for now. + "This is a OSS fileset catalog", + ossProperties); +// ... + +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +oss_properties = { + "location": "oss://bucket/root", + "oss-access-key-id": "access_key", + "oss-secret-access-key": "secret_key", + "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com", + "filesystem-providers": "oss" +} + +oss_catalog = gravitino_client.create_catalog(name="test_catalog", + type=Catalog.Type.FILESET, + provider="hadoop", + comment="This is a OSS fileset catalog", + properties=oss_properties) +``` + + + + +### Step2: Create a schema + +Once the Hadoop catalog with OSS is created, you can create a schema inside that catalog. 
Below are examples of how to do this: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "test_schema", + "comment": "This is a OSS schema", + "properties": { + "location": "oss://bucket/root/schema" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas +``` + + + + +```java +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); + +SupportsSchemas supportsSchemas = catalog.asSchemas(); + +Map schemaProperties = ImmutableMap.builder() + .put("location", "oss://bucket/root/schema") + .build(); +Schema schema = supportsSchemas.createSchema("test_schema", + "This is a OSS schema", + schemaProperties +); +// ... +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_schemas().create_schema(name="test_schema", + comment="This is a OSS schema", + properties={"location": "oss://bucket/root/schema"}) +``` + + + + + +### Create a fileset + +Now that the schema is created, you can create a fileset inside it. 
Here’s how: + + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_fileset", + "comment": "This is an example fileset", + "type": "MANAGED", + "storageLocation": "oss://bucket/root/schema/example_fileset", + "properties": { + "k1": "v1" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); +FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); + +Map propertiesMap = ImmutableMap.builder() + .put("k1", "v1") + .build(); + +filesetCatalog.createFileset( + NameIdentifier.of("test_schema", "example_fileset"), + "This is an example fileset", + Fileset.Type.MANAGED, + "oss://bucket/root/schema/example_fileset", + propertiesMap, +); +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") + +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), + type=Fileset.Type.MANAGED, + comment="This is an example fileset", + storage_location="oss://bucket/root/schema/example_fileset", + properties={"k1": "v1"}) +``` + + + + +## Accessing a fileset with OSS + +### Using the GVFS Java client to access the fileset + +To access fileset with OSS using the GVFS Java client, based on the [basic GVFS configurations](./how-to-use-gvfs.md#configuration-1), you need to add the following configurations: + +| Configuration item | Description | Default value | Required | Since version | +|-------------------------|-----------------------------------|---------------|----------|------------------| +| `oss-endpoint` | The endpoint of the Aliyun OSS. 
| (none) | Yes | 0.7.0-incubating | +| `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes | 0.7.0-incubating | +| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes | 0.7.0-incubating | + +:::note +If the catalog has enabled [credential vending](security/credential-vending.md), the properties above can be omitted. More details can be found in [Fileset with credential vending](#fileset-with-credential-vending). +::: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri", "http://localhost:8090"); +conf.set("fs.gravitino.client.metalake", "test_metalake"); +conf.set("oss-endpoint", "http://localhost:8090"); +conf.set("oss-access-key-id", "minio"); +conf.set("oss-secret-access-key", "minio123"); +Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + +Similar to Spark configurations, you need to add OSS (bundle) jars to the classpath according to your environment. 
+If you want to customize your Hadoop version, or there is already a Hadoop version in your project, you can add the following dependencies to your `pom.xml`: + +```xml + + org.apache.hadoop + hadoop-common + ${HADOOP_VERSION} + + + + org.apache.hadoop + hadoop-aliyun + ${HADOOP_VERSION} + + + + org.apache.gravitino + filesystem-hadoop3-runtime + ${GRAVITINO_VERSION} + + + + org.apache.gravitino + gravitino-aliyun + ${GRAVITINO_VERSION} + +``` + +Or use the bundle jar, which includes the Hadoop environment, if there is no Hadoop environment in your project: + +```xml + + org.apache.gravitino + gravitino-aliyun-bundle + ${GRAVITINO_VERSION} + + + + org.apache.gravitino + filesystem-hadoop3-runtime + ${GRAVITINO_VERSION} + +``` + +### Using Spark to access the fileset + +The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment(Hadoop 3.2.0)** to access the fileset: + +Before running the following code, you need to install required packages: + +```bash +pip install pyspark==3.1.3 +pip install apache-gravitino==${GRAVITINO_VERSION} +``` +Then you can run the following code: + +```python +from pyspark.sql import SparkSession +import os + +gravitino_url = "http://localhost:8090" +metalake_name = "test" + +catalog_name = "your_oss_catalog" +schema_name = "your_oss_schema" +fileset_name = "your_oss_fileset" + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/aliyun-sdk-oss-2.8.3.jar,/path/to/hadoop-aliyun-3.2.0.jar,/path/to/jdom-1.1.jar --master local[1] pyspark-shell" +spark = SparkSession.builder + .appName("oss_fileset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", gravitino_url) + .config("spark.hadoop.fs.gravitino.client.metalake", 
"test") + .config("spark.hadoop.oss-access-key-id", os.environ["OSS_ACCESS_KEY_ID"]) + .config("spark.hadoop.oss-secret-access-key", os.environ["OSS_SECRET_ACCESS_KEY"]) + .config("spark.hadoop.oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com") + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" + +spark_df.coalesce(1).write + .mode("overwrite") + .option("header", "true") + .csv(gvfs_path) +``` + +If your Spark **without Hadoop environment**, you can use the following code snippet to access the fileset: + +```python +## Replace the following code snippet with the above code snippet with the same environment variables + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar, --master local[1] pyspark-shell" +``` + +- [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun-bundle) is the Gravitino Aliyun jar with Hadoop environment(3.3.1) and `hadoop-oss` jar. +- [`gravitino-aliyun-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun) is a condensed version of the Gravitino Aliyun bundle jar without Hadoop environment and `hadoop-aliyun` jar. +-`hadoop-aliyun-3.2.0.jar` and `aliyun-sdk-oss-2.8.3.jar` can be found in the Hadoop distribution in the `${HADOOP_HOME}/share/hadoop/tools/lib` directory. + +Please choose the correct jar according to your environment. + +:::note +In some Spark versions, a Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly. 
+::: + +### Accessing a fileset using the Hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. Adding the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: + +```xml + + fs.AbstractFileSystem.gvfs.impl + org.apache.gravitino.filesystem.hadoop.Gvfs + + + + fs.gvfs.impl + org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem + + + + fs.gravitino.server.uri + http://localhost:8090 + + + + fs.gravitino.client.metalake + test + + + + oss-endpoint + http://oss-cn-hangzhou.aliyuncs.com + + + + oss-access-key-id + access-key + + + + oss-secret-access-key + secret-key + +``` + +2. Add the necessary jars to the Hadoop classpath. + +For OSS, you need to add `gravitino-filesystem-hadoop3-runtime-${gravitino-version}.jar`, `gravitino-aliyun-${gravitino-version}.jar` and `hadoop-aliyun-${hadoop-version}.jar` located at `${HADOOP_HOME}/share/hadoop/tools/lib/` to Hadoop classpath. + +3. Run the following command to access the fileset: + +```shell +./${HADOOP_HOME}/bin/hadoop dfs -ls gvfs://fileset/oss_catalog/oss_schema/oss_fileset +./${HADOOP_HOME}/bin/hadoop dfs -put /path/to/local/file gvfs://fileset/oss_catalog/schema/oss_fileset +``` + +### Using the GVFS Python client to access a fileset + +In order to access fileset with OSS using the GVFS Python client, apart from [basic GVFS configurations](./how-to-use-gvfs.md#configuration-1), you need to add the following configurations: + +| Configuration item | Description | Default value | Required | Since version | +|-------------------------|-----------------------------------|---------------|----------|------------------| +| `oss_endpoint` | The endpoint of the Aliyun OSS. | (none) | Yes | 0.7.0-incubating | +| `oss_access_key_id` | The access key of the Aliyun OSS. | (none) | Yes | 0.7.0-incubating | +| `oss_secret_access_key` | The secret key of the Aliyun OSS. 
| (none) | Yes | 0.7.0-incubating | + +:::note +If the catalog has enabled [credential vending](security/credential-vending.md), the properties above can be omitted. +::: + +Please install the `gravitino` package before running the following code: + +```bash +pip install apache-gravitino==${GRAVITINO_VERSION} +``` + +```python +from gravitino import gvfs +options = { + "cache_size": 20, + "cache_expired_time": 3600, + "auth_type": "simple", + "oss_endpoint": "http://localhost:8090", + "oss_access_key_id": "minio", + "oss_secret_access_key": "minio123" +} +fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) + +fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") +``` + + +### Using fileset with pandas + +The following are examples of how to use the pandas library to access the OSS fileset + +```python +import pandas as pd + +storage_options = { + "server_uri": "http://localhost:8090", + "metalake_name": "test", + "options": { + "oss_access_key_id": "access_key", + "oss_secret_access_key": "secret_key", + "oss_endpoint": "http://oss-cn-hangzhou.aliyuncs.com" + } +} +ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", + storage_options=storage_options) +ds.head() +``` +For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. + +## Fileset with credential vending + +Since 0.8.0-incubating, Gravitino supports credential vending for OSS fileset. If the catalog has been [configured with credential](./security/credential-vending.md), you can access OSS fileset without providing authentication information like `oss-access-key-id` and `oss-secret-access-key` in the properties. 
+ +### How to create a OSS Hadoop catalog with credential enabled + +Apart from configuration method in [create-oss-hadoop-catalog](#configuration-for-an-oss-hadoop-catalog), properties needed by [oss-credential](./security/credential-vending.md#oss-credentials) should also be set to enable credential vending for OSS fileset. + +### How to access OSS fileset with credential + +If the catalog has been configured with credential, you can access OSS fileset without providing authentication information via GVFS Java/Python client and Spark. Let's see how to access OSS fileset with credential: + +GVFS Java client: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri", "http://localhost:8090"); +conf.set("fs.gravitino.client.metalake", "test_metalake"); +// No need to set oss-access-key-id and oss-secret-access-key +Path filesetPath = new Path("gvfs://fileset/oss_test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + +Spark: + +```python +spark = SparkSession.builder + .appName("oss_fileset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.client.metalake", "test") + # No need to set oss-access-key-id and oss-secret-access-key + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() +``` + +Python client and Hadoop command are similar to the above examples. 
+ + diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md new file mode 100644 index 00000000000..7d56f2b9ab8 --- /dev/null +++ b/docs/hadoop-catalog-with-s3.md @@ -0,0 +1,541 @@ +--- +title: "Hadoop catalog with S3" +slug: /hadoop-catalog-with-s3 +date: 2025-01-03 +keyword: Hadoop catalog S3 +license: "This software is licensed under the Apache License version 2." +--- + +This document explains how to configure a Hadoop catalog with S3 in Gravitino. + +## Prerequisites + +To create a Hadoop catalog with S3, follow these steps: + +1. Download the [`gravitino-aws-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws-bundle) file. +2. Place this file in the Gravitino Hadoop catalog classpath at `${GRAVITINO_HOME}/catalogs/hadoop/libs/`. +3. Start the Gravitino server using the following command: + +```bash +$ ${GRAVITINO_HOME}/bin/gravitino-server.sh start +``` + +Once the server is up and running, you can proceed to configure the Hadoop catalog with S3. In the rest of this document we will use `http://localhost:8090` as the Gravitino server URL, please replace it with your actual server URL. 
+ +## Configurations for creating a Hadoop catalog with S3 + +### Configurations for S3 Hadoop Catalog + +In addition to the basic configurations mentioned in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are necessary to configure a Hadoop catalog with S3: + +| Configuration item | Description | Default value | Required | Since version | +|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------|------------------| +| `filesystem-providers` | The file system providers to add. Set it to `s3` if it's a S3 fileset, or a comma separated string that contains `s3` like `gs,s3` to support multiple kinds of fileset including `s3`. | (none) | Yes | 0.7.0-incubating | +| `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for S3, if we set this value, we can omit the prefix 's3a://' in the location. | `builtin-local` | No | 0.7.0-incubating | +| `s3-endpoint` | The endpoint of the AWS S3. This configuration is optional for S3 service, but required for other S3-compatible storage services like MinIO. | (none) | No | 0.7.0-incubating | +| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes | 0.7.0-incubating | +| `s3-secret-access-key` | The secret key of the AWS S3. 
| (none) | Yes | 0.7.0-incubating | +| `credential-providers` | The credential provider types, separated by commas. Possible values are `s3-token` and `s3-secret-key`. As the default authentication type is using AKSK as the above, this configuration can enable credential vending provided by Gravitino server and client will no longer need to provide authentication information like AKSK to access S3 by GVFS. Once it's set, more configuration items are needed to make it work; please see [s3-credential-vending](security/credential-vending.md#s3-credentials) | (none) | No | 0.8.0-incubating | + +### Configurations for a schema + +To learn how to create a schema, refer to [Schema configurations](./hadoop-catalog.md#schema-properties). + +### Configurations for a fileset + +For more details on creating a fileset, refer to [Fileset configurations](./hadoop-catalog.md#fileset-properties). + + +## Using the Hadoop catalog with S3 + +This section demonstrates how to use the Hadoop catalog with S3 in Gravitino, with a complete example. + +### Step1: Create a Hadoop Catalog with S3 + +First of all, you need to create a Hadoop catalog with S3. 
The following example shows how to create a Hadoop catalog with S3: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "test_catalog", + "type": "FILESET", + "comment": "This is a S3 fileset catalog", + "provider": "hadoop", + "properties": { + "location": "s3a://bucket/root", + "s3-access-key-id": "access_key", + "s3-secret-access-key": "secret_key", + "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com", + "filesystem-providers": "s3" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Map s3Properties = ImmutableMap.builder() + .put("location", "s3a://bucket/root") + .put("s3-access-key-id", "access_key") + .put("s3-secret-access-key", "secret_key") + .put("s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") + .put("filesystem-providers", "s3") + .build(); + +Catalog s3Catalog = gravitinoClient.createCatalog("test_catalog", + Type.FILESET, + "hadoop", // provider, Gravitino only supports "hadoop" for now. + "This is a S3 fileset catalog", + s3Properties); +// ... + +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +s3_properties = { + "location": "s3a://bucket/root", + "s3-access-key-id": "access_key", + "s3-secret-access-key": "secret_key", + "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com", + "filesystem-providers": "s3" +} + +s3_catalog = gravitino_client.create_catalog(name="test_catalog", + type=Catalog.Type.FILESET, + provider="hadoop", + comment="This is a S3 fileset catalog", + properties=s3_properties) +``` + + + + +:::note +When using S3 with Hadoop, ensure that the location value starts with s3a:// (not s3://) for AWS S3. 
For example, use s3a://bucket/root, as the s3:// format is not supported by the hadoop-aws library. +::: + +### Step2: Create a schema + +Once your Hadoop catalog with S3 is created, you can create a schema under the catalog. Here are examples of how to do that: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "test_schema", + "comment": "This is a S3 schema", + "properties": { + "location": "s3a://bucket/root/schema" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas +``` + + + + +```java +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); + +SupportsSchemas supportsSchemas = catalog.asSchemas(); + +Map schemaProperties = ImmutableMap.builder() + .put("location", "s3a://bucket/root/schema") + .build(); +Schema schema = supportsSchemas.createSchema("test_schema", + "This is a S3 schema", + schemaProperties +); +// ... +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_schemas().create_schema(name="test_schema", + comment="This is a S3 schema", + properties={"location": "s3a://bucket/root/schema"}) +``` + + + + +### Step3: Create a fileset + +After creating the schema, you can create a fileset. 
Here are examples for creating a fileset: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_fileset", + "comment": "This is an example fileset", + "type": "MANAGED", + "storageLocation": "s3a://bucket/root/schema/example_fileset", + "properties": { + "k1": "v1" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); +FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); + +Map propertiesMap = ImmutableMap.builder() + .put("k1", "v1") + .build(); + +filesetCatalog.createFileset( + NameIdentifier.of("test_schema", "example_fileset"), + "This is an example fileset", + Fileset.Type.MANAGED, + "s3a://bucket/root/schema/example_fileset", + propertiesMap, +); +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") + +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), + type=Fileset.Type.MANAGED, + comment="This is an example fileset", + storage_location="s3a://bucket/root/schema/example_fileset", + properties={"k1": "v1"}) +``` + + + + +## Accessing a fileset with S3 + +### Using the GVFS Java client to access the fileset + +To access fileset with S3 using the GVFS Java client, based on the [basic GVFS configurations](./how-to-use-gvfs.md#configuration-1), you need to add the following configurations: + +| Configuration item | Description | Default value | Required | Since version | 
+|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|---------------|----------|------------------| +| `s3-endpoint` | The endpoint of the AWS S3. This configuration is optional for S3 service, but required for other S3-compatible storage services like MinIO. | (none) | No | 0.7.0-incubating | +| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes | 0.7.0-incubating | +| `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes | 0.7.0-incubating | + +:::note +- `s3-endpoint` is an optional configuration for AWS S3, however, it is required for other S3-compatible storage services like MinIO. +- If the catalog has enabled [credential vending](security/credential-vending.md), the properties above can be omitted. More details can be found in [Fileset with credential vending](#fileset-with-credential-vending). +::: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri", "http://localhost:8090"); +conf.set("fs.gravitino.client.metalake", "test_metalake"); +conf.set("s3-endpoint", "http://localhost:8090"); +conf.set("s3-access-key-id", "minio"); +conf.set("s3-secret-access-key", "minio123"); + +Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + +Similar to Spark configurations, you need to add S3 (bundle) jars to the classpath according to your environment. 
+ +```xml + + org.apache.hadoop + hadoop-common + ${HADOOP_VERSION} + + + + org.apache.hadoop + hadoop-aws + ${HADOOP_VERSION} + + + + org.apache.gravitino + filesystem-hadoop3-runtime + ${GRAVITINO_VERSION} + + + + org.apache.gravitino + gravitino-aws + ${GRAVITINO_VERSION} + +``` + +Or use the bundle jar, which already includes the Hadoop environment, if there is no Hadoop environment available: + + +```xml + + org.apache.gravitino + gravitino-aws-bundle + ${GRAVITINO_VERSION} + + + + org.apache.gravitino + filesystem-hadoop3-runtime + ${GRAVITINO_VERSION} + +``` + +### Using Spark to access the fileset + +The following Python code demonstrates how to use **PySpark 3.1.3 with Hadoop environment (Hadoop 3.2.0)** to access the fileset: + +Before running the following code, you need to install required packages: + +```bash +pip install pyspark==3.1.3 +pip install apache-gravitino==${GRAVITINO_VERSION} +``` +Then you can run the following code: + +```python +from pyspark.sql import SparkSession +import os + +gravitino_url = "http://localhost:8090" +metalake_name = "test" + +catalog_name = "your_s3_catalog" +schema_name = "your_s3_schema" +fileset_name = "your_s3_fileset" + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-${gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-${gravitino-version}-SNAPSHOT.jar,/path/to/hadoop-aws-3.2.0.jar,/path/to/aws-java-sdk-bundle-1.11.375.jar --master local[1] pyspark-shell" +spark = SparkSession.builder + .appName("s3_fileset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.client.metalake", "test") + .config("spark.hadoop.s3-access-key-id", os.environ["S3_ACCESS_KEY_ID"]) + .config("spark.hadoop.s3-secret-access-key", 
os.environ["S3_SECRET_ACCESS_KEY"]) + .config("spark.hadoop.s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" + +spark_df.coalesce(1).write + .mode("overwrite") + .option("header", "true") + .csv(gvfs_path) +``` + +If your Spark is **without Hadoop environment**, you can use the following code snippet to access the fileset: + +```python +## Replace the corresponding line in the code snippet above with the following one, keeping the same environment variables +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-bundle-${gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-${gravitino-version}-SNAPSHOT.jar --master local[1] pyspark-shell" +``` + +- [`gravitino-aws-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws-bundle) is the Gravitino AWS jar with Hadoop environment (3.3.1) and `hadoop-aws` jar. +- [`gravitino-aws-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws) is a condensed version of the Gravitino AWS bundle jar without Hadoop environment and `hadoop-aws` jar. +- `hadoop-aws-3.2.0.jar` and `aws-java-sdk-bundle-1.11.375.jar` can be found in the Hadoop distribution in the `${HADOOP_HOME}/share/hadoop/tools/lib` directory. + +Please choose the correct jar according to your environment. + +:::note +In some Spark versions, a Hadoop environment is needed by the driver, so adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the Spark CLASSPATH directly. 
+::: + +### Accessing a fileset using the Hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. Adding the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: + +```xml + + fs.AbstractFileSystem.gvfs.impl + org.apache.gravitino.filesystem.hadoop.Gvfs + + + + fs.gvfs.impl + org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem + + + + fs.gravitino.server.uri + http://localhost:8090 + + + + fs.gravitino.client.metalake + test + + + + s3-endpoint + http://s3.ap-northeast-1.amazonaws.com + + + + s3-access-key-id + access-key + + + + s3-secret-access-key + secret-key + +``` + +2. Add the necessary jars to the Hadoop classpath. + +For S3, you need to add `gravitino-filesystem-hadoop3-runtime-${gravitino-version}.jar`, `gravitino-aws-${gravitino-version}.jar` and `hadoop-aws-${hadoop-version}.jar` located at `${HADOOP_HOME}/share/hadoop/tools/lib/` to Hadoop classpath. + +3. Run the following command to access the fileset: + +```shell +./${HADOOP_HOME}/bin/hadoop dfs -ls gvfs://fileset/s3_catalog/s3_schema/s3_fileset +./${HADOOP_HOME}/bin/hadoop dfs -put /path/to/local/file gvfs://fileset/s3_catalog/s3_schema/s3_fileset +``` + +### Using the GVFS Python client to access a fileset + +In order to access fileset with S3 using the GVFS Python client, apart from [basic GVFS configurations](./how-to-use-gvfs.md#configuration-1), you need to add the following configurations: + +| Configuration item | Description | Default value | Required | Since version | +|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|---------------|----------|------------------| +| `s3_endpoint` | The endpoint of the AWS S3. This configuration is optional for S3 service, but required for other S3-compatible storage services like MinIO. 
| (none) | No | 0.7.0-incubating | +| `s3_access_key_id` | The access key of the AWS S3. | (none) | Yes | 0.7.0-incubating | +| `s3_secret_access_key` | The secret key of the AWS S3. | (none) | Yes | 0.7.0-incubating | + +:::note +- `s3_endpoint` is an optional configuration for AWS S3, however, it is required for other S3-compatible storage services like MinIO. +- If the catalog has enabled [credential vending](security/credential-vending.md), the properties above can be omitted. +::: + +Please install the `gravitino` package before running the following code: + +```bash +pip install apache-gravitino==${GRAVITINO_VERSION} +``` + +```python +from gravitino import gvfs +options = { + "cache_size": 20, + "cache_expired_time": 3600, + "auth_type": "simple", + "s3_endpoint": "http://localhost:9000", + "s3_access_key_id": "minio", + "s3_secret_access_key": "minio123" +} +fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) +fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") +``` + +### Using fileset with pandas + +The following are examples of how to use the pandas library to access the S3 fileset: + +```python +import pandas as pd + +storage_options = { + "server_uri": "http://localhost:8090", + "metalake_name": "test", + "options": { + "s3_access_key_id": "access_key", + "s3_secret_access_key": "secret_key", + "s3_endpoint": "http://s3.ap-northeast-1.amazonaws.com" + } +} +ds = pd.read_csv(f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", + storage_options=storage_options) +ds.head() +``` + +For more use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. + +## Fileset with credential vending + +Since 0.8.0-incubating, Gravitino supports credential vending for S3 fileset. 
If the catalog has been [configured with credential](./security/credential-vending.md), you can access S3 fileset without providing authentication information like `s3-access-key-id` and `s3-secret-access-key` in the properties. + +### How to create a S3 Hadoop catalog with credential enabled + +Apart from configuration method in [create-s3-hadoop-catalog](#configurations-for-s3-hadoop-catalog), properties needed by [s3-credential](./security/credential-vending.md#s3-credentials) should also be set to enable credential vending for S3 fileset. + +### How to access S3 fileset with credential + +If the catalog has been configured with credential, you can access S3 fileset without providing authentication information via GVFS Java/Python client and Spark. Let's see how to access S3 fileset with credential: + +GVFS Java client: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri", "http://localhost:8090"); +conf.set("fs.gravitino.client.metalake", "test_metalake"); +// No need to set s3-access-key-id and s3-secret-access-key +Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... 
+``` + +Spark: + +```python +spark = SparkSession.builder + .appName("s3_fileset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.client.metalake", "test") + # No need to set s3-access-key-id and s3-secret-access-key + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() +``` + +Python client and Hadoop command are similar to the above examples. + + diff --git a/docs/hadoop-catalog.md b/docs/hadoop-catalog.md index cbdae846899..4b951aedc62 100644 --- a/docs/hadoop-catalog.md +++ b/docs/hadoop-catalog.md @@ -9,9 +9,9 @@ license: "This software is licensed under the Apache License version 2." ## Introduction Hadoop catalog is a fileset catalog that using Hadoop Compatible File System (HCFS) to manage -the storage location of the fileset. Currently, it supports local filesystem and HDFS. For -object storage like S3, GCS, Azure Blob Storage and OSS, you can put the hadoop object store jar like -`gravitino-aws-bundle-{gravitino-version}.jar` into the `$GRAVITINO_HOME/catalogs/hadoop/libs` directory to enable the support. +the storage location of the fileset. Currently, it supports the local filesystem and HDFS. Since 0.7.0-incubating, Gravitino supports [S3](hadoop-catalog-with-s3.md), [GCS](hadoop-catalog-with-gcs.md), [OSS](hadoop-catalog-with-oss.md) and [Azure Blob Storage](hadoop-catalog-with-adls.md) through Hadoop catalog. + +The rest of this document will use HDFS or local file as an example to illustrate how to use the Hadoop catalog. For S3, GCS, OSS and Azure Blob Storage, the configuration is similar to HDFS; please refer to the corresponding document for more details. Note that Gravitino uses Hadoop 3 dependencies to build Hadoop catalog. 
Theoretically, it should be compatible with both Hadoop 2.x and 3.x, since Gravitino doesn't leverage any new features in @@ -23,17 +23,19 @@ Hadoop 3. If there's any compatibility issue, please create an [issue](https://g Besides the [common catalog properties](./gravitino-server-config.md#apache-gravitino-catalog-properties-configuration), the Hadoop catalog has the following properties: -| Property Name | Description | Default Value | Required | Since Version | -|--------------------------------|-----------------------------------------------------------------------------------------------------|---------------|----------|------------------| -| `location` | The storage location managed by Hadoop catalog. | (none) | No | 0.5.0 | -| `filesystem-conn-timeout-secs` | The timeout of getting the file system using Hadoop FileSystem client instance. Time unit: seconds. | 6 | No | 0.8.0-incubating | -| `credential-providers` | The credential provider types, separated by comma. | (none) | No | 0.8.0-incubating | +| Property Name | Description | Default Value | Required | Since Version | +|--------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------|------------------| +| `location` | The storage location managed by Hadoop catalog. | (none) | No | 0.5.0 | +| `default-filesystem-provider` | The default filesystem provider of this Hadoop catalog if users do not specify the scheme in the URI. Candidate values are 'builtin-local', 'builtin-hdfs', 's3', 'gcs', 'abs' and 'oss'. Default value is `builtin-local`. For S3, if we set this value to 's3', we can omit the prefix 's3a://' in the location. 
| `builtin-local` | No | 0.7.0-incubating | +| `filesystem-providers` | The file system providers to add. Users needs to set this configuration to support cloud storage or custom HCFS. For instance, set it to `s3` or a comma separated string that contains `s3` like `gs,s3` to support multiple kinds of fileset including `s3`. | (none) | Yes | 0.7.0-incubating | +| `credential-providers` | The credential provider types, separated by comma. | (none) | No | 0.8.0-incubating | +| `filesystem-conn-timeout-secs` | The timeout of getting the file system using Hadoop FileSystem client instance. Time unit: seconds. | 6 | No | 0.8.0-incubating | Please refer to [Credential vending](./security/credential-vending.md) for more details about credential vending. -Apart from the above properties, to access fileset like HDFS, S3, GCS, OSS or custom fileset, you need to configure the following extra properties. +### HDFS fileset -#### HDFS fileset +Apart from the above properties, to access fileset like HDFS fileset, you need to configure the following extra properties. | Property Name | Description | Default Value | Required | Since Version | |----------------------------------------------------|------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------------------|---------------| @@ -44,66 +46,13 @@ Apart from the above properties, to access fileset like HDFS, S3, GCS, OSS or cu | `authentication.kerberos.check-interval-sec` | The check interval of Kerberos credential for Hadoop catalog. | 60 | No | 0.5.1 | | `authentication.kerberos.keytab-fetch-timeout-sec` | The fetch timeout of retrieving Kerberos keytab from `authentication.kerberos.keytab-uri`. 
| 60 | No | 0.5.1 | -#### S3 fileset - -| Configuration item | Description | Default value | Required | Since version | -|-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|---------------------------|------------------| -| `filesystem-providers` | The file system providers to add. Set it to `s3` if it's a S3 fileset, or a comma separated string that contains `s3` like `gs,s3` to support multiple kinds of fileset including `s3`. | (none) | Yes | 0.7.0-incubating | -| `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for S3, if we set this value, we can omit the prefix 's3a://' in the location. | `builtin-local` | No | 0.7.0-incubating | -| `s3-endpoint` | The endpoint of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | -| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | -| `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | - -Please refer to [S3 credentials](./security/credential-vending.md#s3-credentials) for credential related configurations. - -At the same time, you need to place the corresponding bundle jar [`gravitino-aws-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`. 
- -#### GCS fileset - -| Configuration item | Description | Default value | Required | Since version | -|-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------| -| `filesystem-providers` | The file system providers to add. Set it to `gs` if it's a GCS fileset, a comma separated string that contains `gs` like `gs,s3` to support multiple kinds of fileset including `gs`. | (none) | Yes | 0.7.0-incubating | -| `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for GCS, if we set this value, we can omit the prefix 'gs://' in the location. | `builtin-local` | No | 0.7.0-incubating | -| `gcs-service-account-file` | The path of GCS service account JSON file. | (none) | Yes if it's a GCS fileset. | 0.7.0-incubating | - -Please refer to [GCS credentials](./security/credential-vending.md#gcs-credentials) for credential related configurations. - -In the meantime, you need to place the corresponding bundle jar [`gravitino-gcp-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`. - -#### OSS fileset - -| Configuration item | Description | Default value | Required | Since version | -|-------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------| -| `filesystem-providers` | The file system providers to add. 
Set it to `oss` if it's a OSS fileset, or a comma separated string that contains `oss` like `oss,gs,s3` to support multiple kinds of fileset including `oss`. | (none) | Yes | 0.7.0-incubating | -| `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for OSS, if we set this value, we can omit the prefix 'oss://' in the location. | `builtin-local` | No | 0.7.0-incubating | -| `oss-endpoint` | The endpoint of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | -| `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | -| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | - -Please refer to [OSS credentials](./security/credential-vending.md#oss-credentials) for credential related configurations. - -In the meantime, you need to place the corresponding bundle jar [`gravitino-aliyun-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`. - - -#### Azure Blob Storage fileset - -| Configuration item | Description | Default value | Required | Since version | -|-----------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|-------------------------------------------|------------------| -| `filesystem-providers` | The file system providers to add. Set it to `abs` if it's a Azure Blob Storage fileset, or a comma separated string that contains `abs` like `oss,abs,s3` to support multiple kinds of fileset including `abs`. 
| (none) | Yes | 0.8.0-incubating | -| `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for Azure Blob Storage, if we set this value, we can omit the prefix 'abfss://' in the location. | `builtin-local` | No | 0.8.0-incubating | -| `azure-storage-account-name ` | The account name of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | -| `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | - -Please refer to [ADLS credentials](./security/credential-vending.md#adls-credentials) for credential related configurations. - -Similar to the above, you need to place the corresponding bundle jar [`gravitino-azure-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`. - -:::note -- Gravitino contains builtin file system providers for local file system(`builtin-local`) and HDFS(`builtin-hdfs`), that is to say if `filesystem-providers` is not set, Gravitino will still support local file system and HDFS. Apart from that, you can set the `filesystem-providers` to support other file systems like S3, GCS, OSS or custom file system. -- `default-filesystem-provider` is used to set the default file system provider for the Hadoop catalog. If the user does not specify the scheme in the URI, Gravitino will use the default file system provider to access the fileset. For example, if the default file system provider is set to `builtin-local`, the user can omit the prefix `file:///` in the location. -::: +### Hadoop catalog with Cloud Storage +- For S3, please refer to [Hadoop-catalog-with-s3](./hadoop-catalog-with-s3.md) for more details. 
+- For GCS, please refer to [Hadoop-catalog-with-gcs](./hadoop-catalog-with-gcs.md) for more details. +- For OSS, please refer to [Hadoop-catalog-with-oss](./hadoop-catalog-with-oss.md) for more details. +- For Azure Blob Storage, please refer to [Hadoop-catalog-with-adls](./hadoop-catalog-with-adls.md) for more details. -#### How to custom your own HCFS file system fileset? +### How to custom your own HCFS file system fileset? Developers and users can custom their own HCFS file system fileset by implementing the `FileSystemProvider` interface in the jar [gravitino-catalog-hadoop](https://repo1.maven.org/maven2/org/apache/gravitino/catalog-hadoop/). The `FileSystemProvider` interface is defined as follows: diff --git a/docs/how-to-use-gvfs.md b/docs/how-to-use-gvfs.md index aff3b74adfd..cbbb67dd37c 100644 --- a/docs/how-to-use-gvfs.md +++ b/docs/how-to-use-gvfs.md @@ -42,7 +42,9 @@ the path mapping and convert automatically. ### Prerequisites -+ A Hadoop environment with HDFS or other Hadoop Compatible File System (HCFS) implementations like S3, GCS, etc. GVFS has been tested against Hadoop 3.3.1. It is recommended to use Hadoop 3.3.1 or later, but it should work with Hadoop 2.x. Please create an [issue](https://www.github.com/apache/gravitino/issues) if you find any compatibility issues. + - GVFS has been tested against Hadoop 3.3.1. It is recommended to use Hadoop 3.3.1 or later, but it should work with Hadoop + 2.x. Please create an [issue](https://www.github.com/apache/gravitino/issues) if you find any + compatibility issues. ### Configuration @@ -64,55 +66,8 @@ the path mapping and convert automatically. | `fs.gravitino.fileset.cache.evictionMillsAfterAccess` | The value of time that the cache expires after accessing in the Gravitino Virtual File System. The value is in `milliseconds`. 
| `3600000` | No | 0.5.0 | | `fs.gravitino.fileset.cache.evictionMillsAfterAccess` | The value of time that the cache expires after accessing in the Gravitino Virtual File System. The value is in `milliseconds`. | `3600000` | No | 0.5.0 | -Apart from the above properties, to access fileset like S3, GCS, OSS and custom fileset, you need to configure the following extra properties. - -#### S3 fileset - -| Configuration item | Description | Default value | Required | Since version | -|------------------------|-------------------------------|---------------|---------------------------|------------------| -| `s3-endpoint` | The endpoint of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | -| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | -| `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | - -At the same time, you need to add the corresponding bundle jar -1. [`gravitino-aws-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) in the classpath if no hadoop environment is available, or -2. [`gravitino-aws-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws/) and hadoop-aws jar and other necessary dependencies in the classpath. - - -#### GCS fileset - -| Configuration item | Description | Default value | Required | Since version | -|----------------------------|--------------------------------------------|---------------|----------------------------|------------------| -| `gcs-service-account-file` | The path of GCS service account JSON file. | (none) | Yes if it's a GCS fileset. | 0.7.0-incubating | - -In the meantime, you need to add the corresponding bundle jar -1. [`gravitino-gcp-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp-bundle/) in the classpath if no hadoop environment is available, or -2. 
or [`gravitino-gcp-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp/) and [gcs-connector jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and other necessary dependencies in the classpath. - - -#### OSS fileset - -| Configuration item | Description | Default value | Required | Since version | -|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|---------------------------|------------------| -| `oss-endpoint` | The endpoint of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | -| `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | -| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | - - -In the meantime, you need to place the corresponding bundle jar -1. [`gravitino-aliyun-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun-bundle/) in the classpath if no hadoop environment is available, or -2. [`gravitino-aliyun-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun/) and hadoop-aliyun jar and other necessary dependencies in the classpath. - -#### Azure Blob Storage fileset - -| Configuration item | Description | Default value | Required | Since version | -|-----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------|------------------| -| `azure-storage-account-name` | The account name of Azure Blob Storage. 
| (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | -| `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | - -Similar to the above, you need to place the corresponding bundle jar -1. [`gravitino-azure-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure-bundle/) in the classpath if no hadoop environment is available, or -2. [`gravitino-azure-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure/) and hadoop-azure jar and other necessary dependencies in the classpath. +Apart from the above properties, to access fileset like S3, GCS, OSS and custom fileset, extra properties are needed, please see +[S3 GVFS Java client configurations](./hadoop-catalog-with-s3.md#using-the-gvfs-java-client-to-access-the-fileset), [GCS GVFS Java client configurations](./hadoop-catalog-with-gcs.md#using-the-gvfs-java-client-to-access-the-fileset), [OSS GVFS Java client configurations](./hadoop-catalog-with-oss.md#using-the-gvfs-java-client-to-access-the-fileset) and [Azure Blob Storage GVFS Java client configurations](./hadoop-catalog-with-adls.md#using-the-gvfs-java-client-to-access-the-fileset) for more details. #### Custom fileset Since 0.7.0-incubating, users can define their own fileset type and configure the corresponding properties, for more, please refer to [Custom Fileset](./hadoop-catalog.md#how-to-custom-your-own-hcfs-file-system-fileset). @@ -132,26 +87,10 @@ You can configure these properties in two ways: conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); conf.set("fs.gravitino.server.uri","http://localhost:8090"); conf.set("fs.gravitino.client.metalake","test_metalake"); - - // Optional. It's only for S3 catalog. For GCS and OSS catalog, you should set the corresponding properties. 
- conf.set("s3-endpoint", "http://localhost:9000"); - conf.set("s3-access-key-id", "minio"); - conf.set("s3-secret-access-key", "minio123"); - Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset_1"); FileSystem fs = filesetPath.getFileSystem(conf); ``` -:::note -If you want to access the S3, GCS, OSS or custom fileset through GVFS, apart from the above properties, you need to place the corresponding bundle jars in the Hadoop environment. -For example, if you want to access the S3 fileset, you need to place -1. The aws hadoop bundle jar [`gravitino-aws-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) -2. or [`gravitino-aws-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws/), and hadoop-aws jar and other necessary dependencies - -to the classpath, it typically locates in `${HADOOP_HOME}/share/hadoop/common/lib/`). - -::: - 2. Configure the properties in the `core-site.xml` file of the Hadoop environment: ```xml @@ -174,20 +113,6 @@ to the classpath, it typically locates in `${HADOOP_HOME}/share/hadoop/common/li fs.gravitino.client.metalake test_metalake - - - - s3-endpoint - http://localhost:9000 - - - s3-access-key-id - minio - - - s3-secret-access-key - minio123 - ``` ### Usage examples @@ -223,12 +148,6 @@ cp gravitino-filesystem-hadoop3-runtime-{version}.jar ${HADOOP_HOME}/share/hadoo # You need to ensure that the Kerberos has permission on the HDFS directory. kinit -kt your_kerberos.keytab your_kerberos@xxx.com - -# 4. 
Copy other dependencies to the Hadoop environment if you want to access the S3 fileset via GVFS -cp bundles/aws-bundle/build/libs/gravitino-aws-bundle-{version}.jar ${HADOOP_HOME}/share/hadoop/common/lib/ -cp clients/filesystem-hadoop3-runtime/build/libs/gravitino-filesystem-hadoop3-runtime-{version}-SNAPSHOT.jar ${HADOOP_HOME}/share/hadoop/common/lib/ -cp ${HADOOP_HOME}/share/hadoop/tools/lib/* ${HADOOP_HOME}/share/hadoop/common/lib/ - # 4. Try to list the fileset ./${HADOOP_HOME}/bin/hadoop dfs -ls gvfs://fileset/test_catalog/test_schema/test_fileset_1 ``` @@ -239,36 +158,6 @@ You can also perform operations on the files or directories managed by fileset t Make sure that your code is using the correct Hadoop environment, and that your environment has the `gravitino-filesystem-hadoop3-runtime-{version}.jar` dependency. -```xml - - - org.apache.gravitino - filesystem-hadoop3-runtime - {gravitino-version} - - - - - org.apache.gravitino - gravitino-aws-bundle - {gravitino-version} - - - - - org.apache.gravitino - gravitino-aws - {gravitino-version} - - - - org.apache.hadoop - hadoop-aws - {hadoop-version} - - -``` - For example: ```java @@ -321,7 +210,6 @@ fs.getFileStatus(filesetPath); rdd.foreach(println) ``` - #### Via Tensorflow For Tensorflow to support GVFS, you need to recompile the [tensorflow-io](https://github.com/tensorflow/io) module. @@ -468,61 +356,14 @@ to recompile the native libraries like `libhdfs` and others, and completely repl | `oauth2_scope` | The auth scope for the Gravitino client when using `oauth2` auth type with the Gravitino Virtual File System. | (none) | Yes if you use `oauth2` auth type | 0.7.0-incubating | | `credential_expiration_ratio` | The ratio of expiration time for credential from Gravitino. This is used in the cases where Gravitino Hadoop catalogs have enable credential vending. if the expiration time of credential fetched from Gravitino is 1 hour, GVFS client will try to refresh the credential in 1 * 0.9 = 0.5 hour. 
| 0.5 | No | 0.8.0-incubating | +#### Configurations for S3, GCS, OSS and Azure Blob storage fileset -#### Extra configuration for S3, GCS, OSS fileset - -The following properties are required if you want to access the S3 fileset via the GVFS python client: - -| Configuration item | Description | Default value | Required | Since version | -|----------------------------|------------------------------|---------------|--------------------------|------------------| -| `s3_endpoint` | The endpoint of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | -| `s3_access_key_id` | The access key of the AWS S3.| (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | -| `s3_secret_access_key` | The secret key of the AWS S3.| (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | - -The following properties are required if you want to access the GCS fileset via the GVFS python client: - -| Configuration item | Description | Default value | Required | Since version | -|----------------------------|-------------------------------------------|---------------|---------------------------|------------------| -| `gcs_service_account_file` | The path of GCS service account JSON file.| (none) | Yes if it's a GCS fileset.| 0.7.0-incubating | - -The following properties are required if you want to access the OSS fileset via the GVFS python client: - -| Configuration item | Description | Default value | Required | Since version | -|----------------------------|-----------------------------------|---------------|----------------------------|------------------| -| `oss_endpoint` | The endpoint of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | -| `oss_access_key_id` | The access key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | -| `oss_secret_access_key` | The secret key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. 
| 0.7.0-incubating | - -For Azure Blob Storage fileset, you need to configure the following properties: - -| Configuration item | Description | Default value | Required | Since version | -|--------------------|----------------------------------------|---------------|-------------------------------------------|------------------| -| `abs_account_name` | The account name of Azure Blob Storage | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | -| `abs_account_key` | The account key of Azure Blob Storage | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | - - -You can configure these properties when obtaining the `Gravitino Virtual FileSystem` in Python like this: - -```python -from gravitino import gvfs -options = { - "cache_size": 20, - "cache_expired_time": 3600, - "auth_type": "simple", - # Optional, the following properties are required if you want to access the S3 fileset via GVFS python client, for GCS and OSS fileset, you should set the corresponding properties. - "s3_endpoint": "http://localhost:9000", - "s3_access_key_id": "minio", - "s3_secret_access_key": "minio123" -} -fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) -``` +Please see the cloud-storage-specific configurations [GCS GVFS Python client configurations](./hadoop-catalog-with-gcs.md#using-the-gvfs-python-client-to-access-a-fileset), [S3 GVFS Python client configurations](./hadoop-catalog-with-s3.md#using-the-gvfs-python-client-to-access-a-fileset), [OSS GVFS Python client configurations](./hadoop-catalog-with-oss.md#using-the-gvfs-python-client-to-access-a-fileset) and [Azure Blob Storage GVFS Python client configurations](./hadoop-catalog-with-adls.md#using-the-gvfs-python-client-to-access-a-fileset) for more details. 
:::note - Gravitino python client does not support [customized file systems](hadoop-catalog.md#how-to-custom-your-own-hcfs-file-system-fileset) defined by users due to the limit of `fsspec` library. ::: - ### Usage examples 1. Make sure to obtain the Gravitino library. diff --git a/docs/manage-fileset-metadata-using-gravitino.md b/docs/manage-fileset-metadata-using-gravitino.md index 9d96287b564..0ff84c83461 100644 --- a/docs/manage-fileset-metadata-using-gravitino.md +++ b/docs/manage-fileset-metadata-using-gravitino.md @@ -15,7 +15,9 @@ filesets to manage non-tabular data like training datasets and other raw data. Typically, a fileset is mapped to a directory on a file system like HDFS, S3, ADLS, GCS, etc. With the fileset managed by Gravitino, the non-tabular data can be managed as assets together with -tabular data in Gravitino in a unified way. +tabular data in Gravitino in a unified way. The following operations will use HDFS as an example, for other +HCFS like S3, OSS, GCS, etc, please refer to the corresponding operations [hadoop-with-s3](./hadoop-catalog-with-s3.md), [hadoop-with-oss](./hadoop-catalog-with-oss.md), [hadoop-with-gcs](./hadoop-catalog-with-gcs.md) and +[hadoop-with-adls](./hadoop-catalog-with-adls.md). After a fileset is created, users can easily access, manage the files/directories through the fileset's identifier, without needing to know the physical path of the managed dataset. 
Also, with @@ -53,24 +55,6 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ } }' http://localhost:8090/api/metalakes/metalake/catalogs -# create a S3 catalog -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "catalog", - "type": "FILESET", - "comment": "comment", - "provider": "hadoop", - "properties": { - "location": "s3a://bucket/root", - "s3-access-key-id": "access_key", - "s3-secret-access-key": "secret_key", - "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com", - "filesystem-providers": "s3" - } -}' http://localhost:8090/api/metalakes/metalake/catalogs - -# For others HCFS like GCS, OSS, etc., the properties should be set accordingly. please refer to -# The following link about the catalog properties. ``` @@ -93,25 +77,8 @@ Catalog catalog = gravitinoClient.createCatalog("catalog", "hadoop", // provider, Gravitino only supports "hadoop" for now. "This is a Hadoop fileset catalog", properties); - -// create a S3 catalog -s3Properties = ImmutableMap.builder() - .put("location", "s3a://bucket/root") - .put("s3-access-key-id", "access_key") - .put("s3-secret-access-key", "secret_key") - .put("s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") - .put("filesystem-providers", "s3") - .build(); - -Catalog s3Catalog = gravitinoClient.createCatalog("catalog", - Type.FILESET, - "hadoop", // provider, Gravitino only supports "hadoop" for now. - "This is a S3 fileset catalog", - s3Properties); // ... -// For others HCFS like GCS, OSS, etc., the properties should be set accordingly. please refer to -// The following link about the catalog properties. 
``` @@ -124,23 +91,6 @@ catalog = gravitino_client.create_catalog(name="catalog", provider="hadoop", comment="This is a Hadoop fileset catalog", properties={"location": "/tmp/test1"}) - -# create a S3 catalog -s3_properties = { - "location": "s3a://bucket/root", - "s3-access-key-id": "access_key" - "s3-secret-access-key": "secret_key", - "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com" -} - -s3_catalog = gravitino_client.create_catalog(name="catalog", - type=Catalog.Type.FILESET, - provider="hadoop", - comment="This is a S3 fileset catalog", - properties=s3_properties) - -# For others HCFS like GCS, OSS, etc., the properties should be set accordingly. please refer to -# The following link about the catalog properties. ``` @@ -371,11 +321,8 @@ The `storageLocation` is the physical location of the fileset. Users can specify when creating a fileset, or follow the rules of the catalog/schema location if not specified. The value of `storageLocation` depends on the configuration settings of the catalog: -- If this is a S3 fileset catalog, the `storageLocation` should be in the format of `s3a://bucket-name/path/to/fileset`. -- If this is an OSS fileset catalog, the `storageLocation` should be in the format of `oss://bucket-name/path/to/fileset`. - If this is a local fileset catalog, the `storageLocation` should be in the format of `file:///path/to/fileset`. - If this is a HDFS fileset catalog, the `storageLocation` should be in the format of `hdfs://namenode:port/path/to/fileset`. -- If this is a GCS fileset catalog, the `storageLocation` should be in the format of `gs://bucket-name/path/to/fileset`. For a `MANAGED` fileset, the storage location is: From 5cffeb42d5e85c08c595946572f94c6cd2d44cf9 Mon Sep 17 00:00:00 2001 From: FANNG Date: Tue, 14 Jan 2025 21:29:55 +0800 Subject: [PATCH 21/40] [#6229] docs: add fileset credential vending example (#6231) ### What changes were proposed in this pull request? 
add credential vending document for fileset ### Why are the changes needed? Fix: #6229 ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? just document --- docs/hadoop-catalog-with-adls.md | 26 +++++++++++++++++++++++--- docs/hadoop-catalog-with-gcs.md | 22 +++++++++++++++++++--- docs/hadoop-catalog-with-oss.md | 26 +++++++++++++++++++++++--- docs/hadoop-catalog-with-s3.md | 26 +++++++++++++++++++++++--- 4 files changed, 88 insertions(+), 12 deletions(-) diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md index 96126c6fab9..880166776fd 100644 --- a/docs/hadoop-catalog-with-adls.md +++ b/docs/hadoop-catalog-with-adls.md @@ -480,11 +480,31 @@ For other use cases, please refer to the [Gravitino Virtual File System](./how-t Since 0.8.0-incubating, Gravitino supports credential vending for ADLS fileset. If the catalog has been [configured with credential](./security/credential-vending.md), you can access ADLS fileset without providing authentication information like `azure-storage-account-name` and `azure-storage-account-key` in the properties. -### How to create an ADLS Hadoop catalog with credential enabled +### How to create an ADLS Hadoop catalog with credential vending -Apart from configuration method in [create-adls-hadoop-catalog](#configuration-for-a-adls-hadoop-catalog), properties needed by [adls-credential](./security/credential-vending.md#adls-credentials) should also be set to enable credential vending for ADLS fileset. +Apart from configuration method in [create-adls-hadoop-catalog](#configuration-for-a-adls-hadoop-catalog), properties needed by [adls-credential](./security/credential-vending.md#adls-credentials) should also be set to enable credential vending for ADLS fileset. 
Take `adls-token` credential provider for example: -### How to access ADLS fileset with credential +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "adls-catalog-with-token", + "type": "FILESET", + "comment": "This is a ADLS fileset catalog", + "provider": "hadoop", + "properties": { + "location": "abfss://container@account-name.dfs.core.windows.net/path", + "azure-storage-account-name": "The account name of the Azure Blob Storage", + "azure-storage-account-key": "The account key of the Azure Blob Storage", + "filesystem-providers": "abs", + "credential-providers": "adls-token", + "azure-tenant-id":"The Azure tenant id", + "azure-client-id":"The Azure client id", + "azure-client-secret":"The Azure client secret key" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + +### How to access ADLS fileset with credential vending If the catalog has been configured with credential, you can access ADLS fileset without providing authentication information via GVFS Java/Python client and Spark. Let's see how to access ADLS fileset with credential: diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md index a3eb034b4fe..5422047efd8 100644 --- a/docs/hadoop-catalog-with-gcs.md +++ b/docs/hadoop-catalog-with-gcs.md @@ -459,11 +459,27 @@ For other use cases, please refer to the [Gravitino Virtual File System](./how-t Since 0.8.0-incubating, Gravitino supports credential vending for GCS fileset. If the catalog has been [configured with credential](./security/credential-vending.md), you can access GCS fileset without providing authentication information like `gcs-service-account-file` in the properties. 
-### How to create a GCS Hadoop catalog with credential enabled +### How to create a GCS Hadoop catalog with credential vending -Apart from configuration method in [create-gcs-hadoop-catalog](#configurations-for-a-gcs-hadoop-catalog), properties needed by [gcs-credential](./security/credential-vending.md#gcs-credentials) should also be set to enable credential vending for GCS fileset. +Apart from configuration method in [create-gcs-hadoop-catalog](#configurations-for-a-gcs-hadoop-catalog), properties needed by [gcs-credential](./security/credential-vending.md#gcs-credentials) should also be set to enable credential vending for GCS fileset. Take `gcs-token` credential provider for example: -### How to access GCS fileset with credential +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "gcs-catalog-with-token", + "type": "FILESET", + "comment": "This is a GCS fileset catalog", + "provider": "hadoop", + "properties": { + "location": "gs://bucket/root", + "gcs-service-account-file": "path_of_gcs_service_account_file", + "filesystem-providers": "gcs", + "credential-providers": "gcs-token" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + +### How to access GCS fileset with credential vending If the catalog has been configured with credential, you can access GCS fileset without providing authentication information via GVFS Java/Python client and Spark. Let's see how to access GCS fileset with credential: diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md index e63935c720a..b9ef5f44e27 100644 --- a/docs/hadoop-catalog-with-oss.md +++ b/docs/hadoop-catalog-with-oss.md @@ -495,11 +495,31 @@ For other use cases, please refer to the [Gravitino Virtual File System](./how-t Since 0.8.0-incubating, Gravitino supports credential vending for OSS fileset. 
If the catalog has been [configured with credential](./security/credential-vending.md), you can access OSS fileset without providing authentication information like `oss-access-key-id` and `oss-secret-access-key` in the properties. -### How to create a OSS Hadoop catalog with credential enabled +### How to create an OSS Hadoop catalog with credential vending -Apart from configuration method in [create-oss-hadoop-catalog](#configuration-for-an-oss-hadoop-catalog), properties needed by [oss-credential](./security/credential-vending.md#oss-credentials) should also be set to enable credential vending for OSS fileset. +Apart from configuration method in [create-oss-hadoop-catalog](#configuration-for-an-oss-hadoop-catalog), properties needed by [oss-credential](./security/credential-vending.md#oss-credentials) should also be set to enable credential vending for OSS fileset. Take `oss-token` credential provider for example: -### How to access OSS fileset with credential +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "oss-catalog-with-token", + "type": "FILESET", + "comment": "This is a OSS fileset catalog", + "provider": "hadoop", + "properties": { + "location": "oss://bucket/root", + "oss-access-key-id": "access_key", + "oss-secret-access-key": "secret_key", + "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com", + "filesystem-providers": "oss", + "credential-providers": "oss-token", + "oss-region":"oss-cn-hangzhou", + "oss-role-arn":"The ARN of the role to access the OSS data" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + +### How to access OSS fileset with credential vending If the catalog has been configured with credential, you can access OSS fileset without providing authentication information via GVFS Java/Python client and Spark. 
Let's see how to access OSS fileset with credential: diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md index 7d56f2b9ab8..f1382761894 100644 --- a/docs/hadoop-catalog-with-s3.md +++ b/docs/hadoop-catalog-with-s3.md @@ -498,11 +498,31 @@ For more use cases, please refer to the [Gravitino Virtual File System](./how-to Since 0.8.0-incubating, Gravitino supports credential vending for S3 fileset. If the catalog has been [configured with credential](./security/credential-vending.md), you can access S3 fileset without providing authentication information like `s3-access-key-id` and `s3-secret-access-key` in the properties. -### How to create a S3 Hadoop catalog with credential enabled +### How to create a S3 Hadoop catalog with credential vending -Apart from configuration method in [create-s3-hadoop-catalog](#configurations-for-s3-hadoop-catalog), properties needed by [s3-credential](./security/credential-vending.md#s3-credentials) should also be set to enable credential vending for S3 fileset. +Apart from configuration method in [create-s3-hadoop-catalog](#configurations-for-s3-hadoop-catalog), properties needed by [s3-credential](./security/credential-vending.md#s3-credentials) should also be set to enable credential vending for S3 fileset. 
Take `s3-token` credential provider for example: -### How to access S3 fileset with credential +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "s3-catalog-with-token", + "type": "FILESET", + "comment": "This is a S3 fileset catalog", + "provider": "hadoop", + "properties": { + "location": "s3a://bucket/root", + "s3-access-key-id": "access_key", + "s3-secret-access-key": "secret_key", + "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com", + "filesystem-providers": "s3", + "credential-providers": "s3-token", + "s3-region":"ap-northeast-1", + "s3-role-arn":"The ARN of the role to access the S3 data" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + +### How to access S3 fileset with credential vending If the catalog has been configured with credential, you can access S3 fileset without providing authentication information via GVFS Java/Python client and Spark. Let's see how to access S3 fileset with credential: From 3a48abad1baafee28531519b6d2ff5c4d92c7c2e Mon Sep 17 00:00:00 2001 From: Lord of Abyss <103809695+Abyss-lord@users.noreply.github.com> Date: Wed, 15 Jan 2025 06:15:43 +0800 Subject: [PATCH 22/40] [#6220] improve(CLI): Clean up GravitinoCommandLine class now it been refactored (#6227) ### What changes were proposed in this pull request? Clean up GravitinoCommandLine class now it been refactored ### Why are the changes needed? Fix: #6220 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? local ut test. 
--- .../gravitino/cli/ColumnCommandHandler.java | 4 +- .../gravitino/cli/FilesetCommandHandler.java | 4 +- .../gravitino/cli/GravitinoCommandLine.java | 98 +------------------ .../gravitino/cli/SimpleCommandHandler.java | 53 ++++++++++ .../gravitino/cli/TestSimpleCommands.java | 75 ++++++++++++++ 5 files changed, 134 insertions(+), 100 deletions(-) create mode 100644 clients/cli/src/main/java/org/apache/gravitino/cli/SimpleCommandHandler.java create mode 100644 clients/cli/src/test/java/org/apache/gravitino/cli/TestSimpleCommands.java diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/ColumnCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/ColumnCommandHandler.java index 96f056c1a3c..c0775dae966 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/ColumnCommandHandler.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/ColumnCommandHandler.java @@ -53,7 +53,7 @@ public ColumnCommandHandler( this.command = command; this.ignore = ignore; - this.url = gravitinoCommandLine.getUrl(); + this.url = getUrl(line); this.name = new FullName(line); this.metalake = name.getMetalakeName(); this.catalog = name.getCatalogName(); @@ -65,7 +65,7 @@ public ColumnCommandHandler( @Override protected void handle() { String userName = line.getOptionValue(GravitinoOptions.LOGIN); - Command.setAuthenticationMode(gravitinoCommandLine.getAuth(), userName); + Command.setAuthenticationMode(getAuth(line), userName); List missingEntities = Lists.newArrayList(); if (catalog == null) missingEntities.add(CommandEntities.CATALOG); diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/FilesetCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/FilesetCommandHandler.java index 33fc1fe9ee7..dce797294d8 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/FilesetCommandHandler.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/FilesetCommandHandler.java @@ -54,7 +54,7 @@ public 
FilesetCommandHandler( this.command = command; this.ignore = ignore; - this.url = gravitinoCommandLine.getUrl(); + this.url = getUrl(line); this.name = new FullName(line); this.metalake = name.getMetalakeName(); this.catalog = name.getCatalogName(); @@ -65,7 +65,7 @@ public FilesetCommandHandler( @Override protected void handle() { String userName = line.getOptionValue(GravitinoOptions.LOGIN); - Command.setAuthenticationMode(gravitinoCommandLine.getAuth(), userName); + Command.setAuthenticationMode(getAuth(line), userName); List missingEntities = Lists.newArrayList(); if (catalog == null) missingEntities.add(CommandEntities.CATALOG); diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java index 11737206067..d7e257a8a81 100644 --- a/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/GravitinoCommandLine.java @@ -19,13 +19,11 @@ package org.apache.gravitino.cli; -import com.google.common.base.Joiner; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; -import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; @@ -37,18 +35,13 @@ public class GravitinoCommandLine extends TestableCommandLine { private final Options options; private final String entity; private final String command; - private String urlEnv; - private boolean urlSet = false; private boolean ignore = false; private String ignoreEnv; private boolean ignoreSet = false; - private String authEnv; - private boolean authSet = false; public static final String CMD = "gcli"; // recommended name public static final String DEFAULT_URL = "http://localhost:8090"; // This joiner is used to join multiple outputs to be 
displayed, e.g. roles or groups - private static final Joiner COMMA_JOINER = Joiner.on(", ").skipNulls(); /** * Gravitino Command line. @@ -97,14 +90,8 @@ public void handleSimpleLine() { /* Display command usage. */ if (line.hasOption(GravitinoOptions.HELP)) { displayHelp(options); - } - /* Display Gravitino client version. */ - else if (line.hasOption(GravitinoOptions.VERSION)) { - newClientVersion(getUrl(), ignore).handle(); - } - /* Display Gravitino server version. */ - else if (line.hasOption(GravitinoOptions.SERVER)) { - newServerVersion(getUrl(), ignore).handle(); + } else { + new SimpleCommandHandler(this, line, ignore).handle(); } } @@ -168,85 +155,4 @@ private void handleHelpCommand() { Main.exit(-1); } } - - /** - * Retrieves the Gravitinno URL from the command line options or the GRAVITINO_URL environment - * variable or the Gravitio config file. - * - * @return The Gravitinno URL, or null if not found. - */ - public String getUrl() { - GravitinoConfig config = new GravitinoConfig(null); - - // If specified on the command line use that - if (line.hasOption(GravitinoOptions.URL)) { - return line.getOptionValue(GravitinoOptions.URL); - } - - // Cache the Gravitino URL environment variable - if (urlEnv == null && !urlSet) { - urlEnv = System.getenv("GRAVITINO_URL"); - urlSet = true; - } - - // If set return the Gravitino URL environment variable - if (urlEnv != null) { - return urlEnv; - } - - // Check if the Gravitino URL is specified in the configuration file - if (config.fileExists()) { - config.read(); - String configURL = config.getGravitinoURL(); - if (configURL != null) { - return configURL; - } - } - - // Return the default localhost URL - return DEFAULT_URL; - } - - /** - * Retrieves the Gravitinno authentication from the command line options or the GRAVITINO_AUTH - * environment variable or the Gravitio config file. - * - * @return The Gravitinno authentication, or null if not found. 
- */ - public String getAuth() { - // If specified on the command line use that - if (line.hasOption(GravitinoOptions.SIMPLE)) { - return GravitinoOptions.SIMPLE; - } - - // Cache the Gravitino authentication type environment variable - if (authEnv == null && !authSet) { - authEnv = System.getenv("GRAVITINO_AUTH"); - authSet = true; - } - - // If set return the Gravitino authentication type environment variable - if (authEnv != null) { - return authEnv; - } - - // Check if the authentication type is specified in the configuration file - GravitinoConfig config = new GravitinoConfig(null); - if (config.fileExists()) { - config.read(); - String configAuthType = config.getGravitinoAuthType(); - if (configAuthType != null) { - return configAuthType; - } - } - - return null; - } - - private void checkEntities(List entities) { - if (!entities.isEmpty()) { - System.err.println(ErrorMessages.MISSING_ENTITIES + COMMA_JOINER.join(entities)); - Main.exit(-1); - } - } } diff --git a/clients/cli/src/main/java/org/apache/gravitino/cli/SimpleCommandHandler.java b/clients/cli/src/main/java/org/apache/gravitino/cli/SimpleCommandHandler.java new file mode 100644 index 00000000000..48aca9f9569 --- /dev/null +++ b/clients/cli/src/main/java/org/apache/gravitino/cli/SimpleCommandHandler.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.cli; + +import org.apache.commons.cli.CommandLine; + +/** Handles the command execution for simple command based on the command line options. */ +public class SimpleCommandHandler extends CommandHandler { + private final GravitinoCommandLine gravitinoCommandLine; + private final CommandLine line; + private final boolean ignore; + + /** + * Constructs a {@link SimpleCommandHandler} instance. + * + * @param gravitinoCommandLine The Gravitino command line instance. + * @param line The command line arguments. + * @param ignore Ignore server version mismatch. + */ + public SimpleCommandHandler( + GravitinoCommandLine gravitinoCommandLine, CommandLine line, boolean ignore) { + this.gravitinoCommandLine = gravitinoCommandLine; + this.line = line; + this.ignore = ignore; + } + + /** Handles the command execution logic based on the provided command. 
*/ + @Override + protected void handle() { + if (line.hasOption(GravitinoOptions.VERSION)) { + gravitinoCommandLine.newClientVersion(getUrl(line), ignore).validate().handle(); + } else if (line.hasOption(GravitinoOptions.SERVER)) { + gravitinoCommandLine.newServerVersion(getUrl(line), ignore).validate().handle(); + } + } +} diff --git a/clients/cli/src/test/java/org/apache/gravitino/cli/TestSimpleCommands.java b/clients/cli/src/test/java/org/apache/gravitino/cli/TestSimpleCommands.java new file mode 100644 index 00000000000..044e06c58f7 --- /dev/null +++ b/clients/cli/src/test/java/org/apache/gravitino/cli/TestSimpleCommands.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.gravitino.cli; + +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Options; +import org.apache.gravitino.cli.commands.ClientVersion; +import org.apache.gravitino.cli.commands.ServerVersion; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestSimpleCommands { + + private CommandLine mockCommandLine; + private Options mockOptions; + + @BeforeEach + void setUp() { + mockCommandLine = mock(CommandLine.class); + mockOptions = mock(Options.class); + } + + @Test + void testServerVersion() { + ServerVersion mockServerVersion = mock(ServerVersion.class); + when(mockCommandLine.hasOption(GravitinoOptions.SERVER)).thenReturn(true); + GravitinoCommandLine commandLine = + spy(new GravitinoCommandLine(mockCommandLine, mockOptions, null, null)); + + doReturn(mockServerVersion) + .when(commandLine) + .newServerVersion(GravitinoCommandLine.DEFAULT_URL, false); + doReturn(mockServerVersion).when(mockServerVersion).validate(); + commandLine.handleSimpleLine(); + verify(mockServerVersion).handle(); + } + + @Test + void testClientVersion() { + ClientVersion mockClientVersion = mock(ClientVersion.class); + when(mockCommandLine.hasOption(GravitinoOptions.VERSION)).thenReturn(true); + GravitinoCommandLine commandLine = + spy(new GravitinoCommandLine(mockCommandLine, mockOptions, null, null)); + + doReturn(mockClientVersion) + .when(commandLine) + .newClientVersion(GravitinoCommandLine.DEFAULT_URL, false); + doReturn(mockClientVersion).when(mockClientVersion).validate(); + commandLine.handleSimpleLine(); + verify(mockClientVersion).handle(); + } +} From 39ad18afa0738c20071f969955dc01e9d4c6ff24 Mon Sep 17 00:00:00 2001 From: Chun-Hao Liu Date: Wed, 15 Jan 2025 11:12:59 +0800 Subject: 
[PATCH 23/40] [#5976] Improvement(bin):Add validation checks to the startup scripts to prevent incorrect usage (#5977) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? #5976 - Add file suffix ‘template’ to the following scripts: - bin/gravitino.sh - bin/common.sh - bin/gravitino-iceberg-rest-server.sh - Add a validation check on `GRAVITINO_VERSION` in the script bin/common.sh (renamed to bin/common.sh.template) with the following: ```bash GRAVITINO_VERSION=GRAVITINO_VERSION_PLACEHOLDER if [[ "$GRAVITINO_VERSION" == *_VERSION_PLACEHOLDER ]]; then echo "GRAVITINO_VERSION is not set. Please make sure you are running the script from the distribution/package/bin and before running the script, run './gradle clean build -x test compileDistribution'" exit 1 fi ``` - Update the following tasks in the root build.gradle.kts as described below: - compileDistribution - compileIcebergRESTServer ```bash eachFile { if (name == "gravitino-env.sh" || name == "common.sh") { filter { line -> line.replace("GRAVITINO_VERSION_PLACEHOLDER", "$version") } } } ``` ### Why are the changes needed? To prevent incorrect usage with startup scripts ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? 
- The scripts below will exit with status 1 and print an error message with the correct instructions ```bash cd bin gravitino.sh.template start gravitino-iceberg-rest-server.sh.template start ``` - The correct way to run Gravitino: ```bash ./gradlew clean build -x test compileDistribution cd distribution/package/bin ./gravitino.sh start ./gravitino-iceberg-rest-server.sh start ``` --- bin/{common.sh => common.sh.template} | 9 +++++++++ ...rver.sh => gravitino-iceberg-rest-server.sh.template} | 0 bin/{gravitino.sh => gravitino.sh.template} | 0 docs/getting-started.md | 4 ++-- 4 files changed, 11 insertions(+), 2 deletions(-) rename bin/{common.sh => common.sh.template} (90%) rename bin/{gravitino-iceberg-rest-server.sh => gravitino-iceberg-rest-server.sh.template} (100%) rename bin/{gravitino.sh => gravitino.sh.template} (100%) diff --git a/bin/common.sh b/bin/common.sh.template similarity index 90% rename from bin/common.sh rename to bin/common.sh.template index a6f002ad91d..b81710a3fc5 100644 --- a/bin/common.sh +++ b/bin/common.sh.template @@ -42,6 +42,15 @@ if [[ -f "${GRAVITINO_CONF_DIR}/gravitino-env.sh" ]]; then . "${GRAVITINO_CONF_DIR}/gravitino-env.sh" fi +if [[ -z "${GRAVITINO_VERSION}" ]]; then + echo -e "GRAVITINO_VERSION is not set, you may need to:\n" \ + "1. Ensure that a compiled version of Gravitino is available at " \ + "\${GRAVITINO_HOME}/distribution/package. You may need to compile it first, " \ + "if you are installing the software from source code.\n" \ + "2. Execute gravitino.sh in the \${GRAVITINO_HOME}/distribution/package/bin directory."
+ exit 1 +fi + GRAVITINO_CLASSPATH+=":${GRAVITINO_CONF_DIR}" JVM_VERSION=8 diff --git a/bin/gravitino-iceberg-rest-server.sh b/bin/gravitino-iceberg-rest-server.sh.template similarity index 100% rename from bin/gravitino-iceberg-rest-server.sh rename to bin/gravitino-iceberg-rest-server.sh.template diff --git a/bin/gravitino.sh b/bin/gravitino.sh.template similarity index 100% rename from bin/gravitino.sh rename to bin/gravitino.sh.template diff --git a/docs/getting-started.md b/docs/getting-started.md index 7b9ce193d25..f729d418acf 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -176,10 +176,10 @@ To use Gravitino locally on macOS or Linux, follow these similar steps: Or, you can install Gravitino from scratch, follow [how-to-build](./how-to-build.md) and [how-to-install](./how-to-install.md). -3. Start Gravitino using the gravitino.sh script: +3. Start Gravitino using the gravitino.sh script in the binary release package or Docker image: ```shell - /bin/gravitino.sh start + ${GRAVITINO_HOME}/bin/gravitino.sh start ``` ## Installing Apache Hive on AWS or Google Cloud Platform From 9ca88e0b06a75366c680610397f136519e8890f4 Mon Sep 17 00:00:00 2001 From: yangyang zhong <35210666+hdygxsj@users.noreply.github.com> Date: Wed, 15 Jan 2025 18:20:25 +0800 Subject: [PATCH 24/40] [#5194] feat(flink): Support basic table DDL Operation for paimon-catalog (#6224) ### What changes were proposed in this pull request? Support basic table DDL Operation for paimon-catalog ### Why are the changes needed? Fix: #5194 ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? 
org.apache.gravitino.flink.connector.integration.test.paimon.FlinkPaimonCatalogIT --- .../flink/connector/catalog/BaseCatalog.java | 4 +-- .../paimon/GravitinoPaimonCatalog.java | 24 ++++++++++++++++++ .../integration/test/FlinkEnvIT.java | 8 ++---- .../test/hive/FlinkHiveCatalogIT.java | 25 +++++++++++++++++++ .../test/paimon/FlinkPaimonCatalogIT.java | 10 -------- 5 files changed, 53 insertions(+), 18 deletions(-) diff --git a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/catalog/BaseCatalog.java b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/catalog/BaseCatalog.java index 1496742177f..fd8e118ee49 100644 --- a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/catalog/BaseCatalog.java +++ b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/catalog/BaseCatalog.java @@ -656,11 +656,11 @@ static SchemaChange[] getSchemaChange(CatalogDatabase current, CatalogDatabase u return schemaChanges.toArray(new SchemaChange[0]); } - private Catalog catalog() { + protected Catalog catalog() { return GravitinoCatalogManager.get().getGravitinoCatalogInfo(getName()); } - private String catalogName() { + protected String catalogName() { return getName(); } } diff --git a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalog.java b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalog.java index 017ac6e7085..c22e00fa122 100644 --- a/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalog.java +++ b/flink-connector/flink/src/main/java/org/apache/gravitino/flink/connector/paimon/GravitinoPaimonCatalog.java @@ -19,10 +19,17 @@ package org.apache.gravitino.flink.connector.paimon; +import java.util.Optional; import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.ObjectPath; +import 
org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.factories.Factory; +import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.flink.connector.PartitionConverter; import org.apache.gravitino.flink.connector.PropertiesConverter; import org.apache.gravitino.flink.connector.catalog.BaseCatalog; +import org.apache.paimon.flink.FlinkTableFactory; /** * The GravitinoPaimonCatalog class is an implementation of the BaseCatalog class that is used to @@ -45,4 +52,21 @@ protected GravitinoPaimonCatalog( protected AbstractCatalog realCatalog() { return paimonCatalog; } + + @Override + public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + boolean dropped = + catalog() + .asTableCatalog() + .purgeTable(NameIdentifier.of(tablePath.getDatabaseName(), tablePath.getObjectName())); + if (!dropped && !ignoreIfNotExists) { + throw new TableNotExistException(catalogName(), tablePath); + } + } + + @Override + public Optional getFactory() { + return Optional.of(new FlinkTableFactory()); + } } diff --git a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/FlinkEnvIT.java b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/FlinkEnvIT.java index 5ae8847c6c1..f56b5297e17 100644 --- a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/FlinkEnvIT.java +++ b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/FlinkEnvIT.java @@ -19,7 +19,6 @@ package org.apache.gravitino.flink.connector.integration.test; import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; import com.google.errorprone.annotations.FormatMethod; import com.google.errorprone.annotations.FormatString; import java.io.IOException; @@ -159,17 
+158,14 @@ protected TableResult sql(@FormatString String sql, Object... args) { return tableEnv.executeSql(String.format(sql, args)); } - protected static void doWithSchema( + protected void doWithSchema( Catalog catalog, String schemaName, Consumer action, boolean dropSchema) { Preconditions.checkNotNull(catalog); Preconditions.checkNotNull(schemaName); try { tableEnv.useCatalog(catalog.name()); if (!catalog.asSchemas().schemaExists(schemaName)) { - catalog - .asSchemas() - .createSchema( - schemaName, null, ImmutableMap.of("location", warehouse + "/" + schemaName)); + catalog.asSchemas().createSchema(schemaName, null, null); } tableEnv.useDatabase(schemaName); action.accept(catalog); diff --git a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/hive/FlinkHiveCatalogIT.java b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/hive/FlinkHiveCatalogIT.java index 333aa83f0b6..bb7b25f6b20 100644 --- a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/hive/FlinkHiveCatalogIT.java +++ b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/hive/FlinkHiveCatalogIT.java @@ -29,6 +29,7 @@ import java.util.Arrays; import java.util.Map; import java.util.Optional; +import java.util.function.Consumer; import java.util.stream.Collectors; import org.apache.flink.configuration.Configuration; import org.apache.flink.table.api.DataTypes; @@ -586,4 +587,28 @@ public void testGetHiveTable() { protected org.apache.gravitino.Catalog currentCatalog() { return hiveCatalog; } + + protected void doWithSchema( + org.apache.gravitino.Catalog catalog, + String schemaName, + Consumer action, + boolean dropSchema) { + Preconditions.checkNotNull(catalog); + Preconditions.checkNotNull(schemaName); + try { + tableEnv.useCatalog(catalog.name()); + if (!catalog.asSchemas().schemaExists(schemaName)) { + catalog + .asSchemas() + .createSchema( + 
schemaName, null, ImmutableMap.of("location", warehouse + "/" + schemaName)); + } + tableEnv.useDatabase(schemaName); + action.accept(catalog); + } finally { + if (dropSchema) { + catalog.asSchemas().dropSchema(schemaName, true); + } + } + } } diff --git a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/paimon/FlinkPaimonCatalogIT.java b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/paimon/FlinkPaimonCatalogIT.java index 10fab3567a3..57a17c2a114 100644 --- a/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/paimon/FlinkPaimonCatalogIT.java +++ b/flink-connector/flink/src/test/java/org/apache/gravitino/flink/connector/integration/test/paimon/FlinkPaimonCatalogIT.java @@ -42,16 +42,6 @@ public class FlinkPaimonCatalogIT extends FlinkCommonIT { private static org.apache.gravitino.Catalog catalog; - @Override - protected boolean supportColumnOperation() { - return false; - } - - @Override - protected boolean supportTableOperation() { - return false; - } - @Override protected boolean supportSchemaOperationWithCommentAndOptions() { return false; From 24c9076acf55915bdfdae5dcb3892cd8dd83d0af Mon Sep 17 00:00:00 2001 From: Xiaojian Sun Date: Wed, 15 Jan 2025 19:08:01 +0800 Subject: [PATCH 25/40] [#6237]fix: add missing @override annotations (#6244) ### What changes were proposed in this pull request? Add missing `@override` annotations ### Why are the changes needed? Fix: https://github.com/apache/gravitino/issues/6237 ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? 
N/A --- .../gravitino/authorization/ranger/RangerClientExtension.java | 2 ++ .../gravitino/authorization/ranger/reference/VXGroup.java | 1 + .../apache/gravitino/authorization/ranger/reference/VXUser.java | 1 + .../catalog/oceanbase/operation/OceanBaseTableOperations.java | 1 + .../java/org/apache/gravitino/hook/MetalakeHookDispatcher.java | 1 + .../gravitino/listener/api/event/CreateTablePreEvent.java | 1 + .../provider/postgresql/CatalogMetaPostgreSQLProvider.java | 1 + .../provider/postgresql/MetalakeMetaPostgreSQLProvider.java | 1 + .../provider/postgresql/SecurableObjectPostgreSQLProvider.java | 1 + .../mapper/provider/postgresql/TagMetaPostgreSQLProvider.java | 1 + 10 files changed, 11 insertions(+) diff --git a/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/RangerClientExtension.java b/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/RangerClientExtension.java index a554559ea5c..e1e9f6955d2 100644 --- a/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/RangerClientExtension.java +++ b/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/RangerClientExtension.java @@ -100,12 +100,14 @@ public RangerClientExtension(String hostName, String authType, String username, } } + @Override public RangerPolicy createPolicy(RangerPolicy policy) throws RangerServiceException { Preconditions.checkArgument( policy.getResources().size() > 0, "Ranger policy resources can not be empty!"); return super.createPolicy(policy); } + @Override public RangerPolicy updatePolicy(long policyId, RangerPolicy policy) throws RangerServiceException { Preconditions.checkArgument( diff --git a/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/reference/VXGroup.java b/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/reference/VXGroup.java index 
3a58f5c95a0..611127ec3f2 100644 --- a/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/reference/VXGroup.java +++ b/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/reference/VXGroup.java @@ -60,6 +60,7 @@ public VXGroup() { * * @return formatedStr */ + @Override public String toString() { String str = "VXGroup={"; str += super.toString(); diff --git a/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/reference/VXUser.java b/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/reference/VXUser.java index f605d987de0..3dbc2b0236b 100644 --- a/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/reference/VXUser.java +++ b/authorizations/authorization-ranger/src/main/java/org/apache/gravitino/authorization/ranger/reference/VXUser.java @@ -75,6 +75,7 @@ public String getName() { * * @return formatedStr */ + @Override public String toString() { String str = "VXUser={"; str += super.toString(); diff --git a/catalogs/catalog-jdbc-oceanbase/src/main/java/org/apache/gravitino/catalog/oceanbase/operation/OceanBaseTableOperations.java b/catalogs/catalog-jdbc-oceanbase/src/main/java/org/apache/gravitino/catalog/oceanbase/operation/OceanBaseTableOperations.java index 77c97290927..98f2d174f1a 100644 --- a/catalogs/catalog-jdbc-oceanbase/src/main/java/org/apache/gravitino/catalog/oceanbase/operation/OceanBaseTableOperations.java +++ b/catalogs/catalog-jdbc-oceanbase/src/main/java/org/apache/gravitino/catalog/oceanbase/operation/OceanBaseTableOperations.java @@ -185,6 +185,7 @@ protected Map getTableProperties(Connection connection, String t } } + @Override protected void correctJdbcTableFields( Connection connection, String databaseName, String tableName, JdbcTable.Builder tableBuilder) throws SQLException { diff --git 
a/core/src/main/java/org/apache/gravitino/hook/MetalakeHookDispatcher.java b/core/src/main/java/org/apache/gravitino/hook/MetalakeHookDispatcher.java index 26f31a88396..aa53b8800f8 100644 --- a/core/src/main/java/org/apache/gravitino/hook/MetalakeHookDispatcher.java +++ b/core/src/main/java/org/apache/gravitino/hook/MetalakeHookDispatcher.java @@ -116,6 +116,7 @@ public void disableMetalake(NameIdentifier ident) throws NoSuchMetalakeException dispatcher.disableMetalake(ident); } + @Override public boolean dropMetalake(NameIdentifier ident) { // For metalake, we don't clear all the privileges of catalog authorization plugin. // we just remove metalake. diff --git a/core/src/main/java/org/apache/gravitino/listener/api/event/CreateTablePreEvent.java b/core/src/main/java/org/apache/gravitino/listener/api/event/CreateTablePreEvent.java index 6c01d614f3c..dd6b8cc123b 100644 --- a/core/src/main/java/org/apache/gravitino/listener/api/event/CreateTablePreEvent.java +++ b/core/src/main/java/org/apache/gravitino/listener/api/event/CreateTablePreEvent.java @@ -43,6 +43,7 @@ public TableInfo createTableRequest() { return createTableRequest; } + @Override public OperationType operationType() { return OperationType.CREATE_TABLE; } diff --git a/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/CatalogMetaPostgreSQLProvider.java b/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/CatalogMetaPostgreSQLProvider.java index abaf2c59af9..77bf3c4e285 100644 --- a/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/CatalogMetaPostgreSQLProvider.java +++ b/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/CatalogMetaPostgreSQLProvider.java @@ -76,6 +76,7 @@ public String insertCatalogMetaOnDuplicateKeyUpdate(CatalogPO catalogPO) { + " deleted_at = #{catalogMeta.deletedAt}"; } + @Override public String updateCatalogMeta( @Param("newCatalogMeta") 
CatalogPO newCatalogPO, @Param("oldCatalogMeta") CatalogPO oldCatalogPO) { diff --git a/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/MetalakeMetaPostgreSQLProvider.java b/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/MetalakeMetaPostgreSQLProvider.java index a95d7f09fe3..06dde29751c 100644 --- a/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/MetalakeMetaPostgreSQLProvider.java +++ b/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/MetalakeMetaPostgreSQLProvider.java @@ -62,6 +62,7 @@ public String insertMetalakeMetaOnDuplicateKeyUpdate(MetalakePO metalakePO) { + " deleted_at = #{metalakeMeta.deletedAt}"; } + @Override public String updateMetalakeMeta( @Param("newMetalakeMeta") MetalakePO newMetalakePO, @Param("oldMetalakeMeta") MetalakePO oldMetalakePO) { diff --git a/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/SecurableObjectPostgreSQLProvider.java b/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/SecurableObjectPostgreSQLProvider.java index 92352bcd95a..6de57dbdc48 100644 --- a/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/SecurableObjectPostgreSQLProvider.java +++ b/core/src/main/java/org/apache/gravitino/storage/relational/mapper/provider/postgresql/SecurableObjectPostgreSQLProvider.java @@ -32,6 +32,7 @@ import org.apache.ibatis.annotations.Param; public class SecurableObjectPostgreSQLProvider extends SecurableObjectBaseSQLProvider { + @Override public String batchSoftDeleteSecurableObjects( @Param("securableObjects") List securableObjectPOs) { return "