Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[copy_from] AWS source #31144

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 44 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,17 @@ crates_repository(
# Note: This is a target we add from the additive build file above.
deps = [":zlib"],
)],
# TODO(parkmycar): Refactor this to build the version of bzip2 from the `bzip2-sys` crate.
"bzip2-sys": [crate.annotation(
gen_build_script = False,
deps = ["@bzip2"],
)],
"lzma-sys": [crate.annotation(
additive_build_file = "@//misc/bazel/c_deps:rust-sys/BUILD.lzma-sys.bazel",
gen_build_script = False,
# Note: This is a target we add from the additive build file above.
deps = [":xz"],
)],
"openssl-sys": [crate.annotation(
build_script_data = [
"@openssl//:openssl_lib",
Expand Down
68 changes: 68 additions & 0 deletions misc/bazel/c_deps/rust-sys/BUILD.lzma-sys.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License in the LICENSE file at the
# root of this repository, or online at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Builds xz from the lzma-sys crate."""

cc_library(
    name = "xz",
    # Sources are globbed from the xz-5.2 tree: the `.c` files of the public
    # API directory, the `.c` and `.h` files of each liblzma component listed
    # below, and the headers under `src/common`.
    srcs = glob(
        include = ["xz-5.2/src/liblzma/api/**/*.c"] + [
            "xz-5.2/src/liblzma/{}/**/*.{}".format(component, extension)
            for component in [
                "lzma",
                "lz",
                "check",
                "simple",
                "delta",
                "common",
                "rangecoder",
            ]
            for extension in [
                "c",
                "h",
            ]
        ] + ["xz-5.2/src/common/**/*.h"],
        # Files matching these patterns are left out of the library.
        # NOTE(review): presumably the "small" CRC variants duplicate the
        # table-based implementations and the tablegen files are standalone
        # generator programs -- confirm against the xz sources.
        exclude = [
            "**/*crc32_small*",
            "**/*crc64_small*",
            "**/*tablegen*",
        ],
    ) + [
        # Explicitly listed files that the glob above does not pick up.
        "config.h",
        "xz-5.2/src/common/tuklib_cpucores.c",
        "xz-5.2/src/common/tuklib_physmem.c",
    ],
    # Only the headers under the API directory are exported to dependents.
    hdrs = glob(["xz-5.2/src/liblzma/api/**/*.h"]),
    copts = [
        "-std=c99",
        "-pthread",
    ],
    includes = [
        "xz-5.2/src/common",
        "xz-5.2/src/liblzma/api",
        "xz-5.2/src/liblzma/check",
        "xz-5.2/src/liblzma/common",
        "xz-5.2/src/liblzma/delta",
        "xz-5.2/src/liblzma/lz",
        "xz-5.2/src/liblzma/lzma",
        "xz-5.2/src/liblzma/rangecoder",
        "xz-5.2/src/liblzma/simple",
        # The current working directory.
        "",
    ],
    # Defined only while compiling this target, not for its dependents.
    local_defines = ["HAVE_CONFIG_H=1"],
)
2 changes: 1 addition & 1 deletion src/adapter/src/coord/sequencer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ impl Coordinator {
session,
);
}
CopyFromSource::Url(_) => {
CopyFromSource::Url(_) | CopyFromSource::AwsS3 { .. } => {
self.sequence_copy_from(ctx, plan, target_cluster).await;
}
},
Expand Down
98 changes: 83 additions & 15 deletions src/adapter/src/coord/sequencer/inner/copy_from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0.

use std::str::FromStr;

use mz_adapter_types::connection::ConnectionId;
use mz_ore::cast::CastInto;
use mz_persist_client::batch::ProtoBatch;
use mz_pgcopy::CopyFormatParams;
use mz_repr::{CatalogItemId, Datum, RowArena};
use mz_sql::plan::{self, CopyFromSource, HirScalarExpr};
use mz_sql::plan::{self, CopyFromFilter, CopyFromSource, HirScalarExpr};
use mz_sql::session::metadata::SessionMetadata;
use mz_storage_client::client::TableData;
use mz_storage_types::oneshot_sources::OneshotIngestionRequest;
Expand All @@ -36,17 +39,11 @@ impl Coordinator {
id,
source,
columns: _,
params: _,
params,
filter,
} = plan;

let from_expr = match source {
CopyFromSource::Url(from_expr) => from_expr,
CopyFromSource::Stdin => {
unreachable!("COPY FROM STDIN should be handled elsewhere")
}
};

let eval_url = |from: HirScalarExpr| -> Result<Url, AdapterError> {
let eval_uri = |from: HirScalarExpr| -> Result<String, AdapterError> {
let style = ExprPrepStyle::OneShot {
logical_time: EvalTime::NotAvailable,
session: ctx.session(),
Expand All @@ -65,10 +62,8 @@ impl Coordinator {
other => coord_bail!("programming error! COPY FROM target cannot be {other}"),
};

Url::parse(eval_string)
.map_err(|err| AdapterError::Unstructured(anyhow::anyhow!("{err}")))
Ok(eval_string.to_string())
};
let url = return_if_err!(eval_url(from_expr), ctx);

// We check in planning that we're copying into a Table, but be defensive.
let Some(dest_table) = self.catalog().get_entry(&id).table() else {
Expand All @@ -80,9 +75,82 @@ impl Coordinator {
// Generate a unique UUID for our ingestion.
let ingestion_id = Uuid::new_v4();
let collection_id = dest_table.global_id_writes();

let format = match params {
CopyFormatParams::Csv(csv) => {
mz_storage_types::oneshot_sources::ContentFormat::Csv(csv.to_owned())
}
CopyFormatParams::Text(_) | CopyFormatParams::Binary => {
mz_ore::soft_panic_or_log!("unsupported formats should be rejected in planning");
ctx.retire(Err(AdapterError::Unsupported("COPY FROM URL format")));
return;
}
};

let source = match source {
CopyFromSource::Url(from_expr) => {
let url = return_if_err!(eval_uri(from_expr), ctx);
// TODO(cf2): Structured errors.
let result = Url::parse(&url)
.map_err(|err| AdapterError::Unstructured(anyhow::anyhow!("{err}")));
let url = return_if_err!(result, ctx);

mz_storage_types::oneshot_sources::ContentSource::Http { url }
}
CopyFromSource::AwsS3 {
uri,
connection,
connection_id,
} => {
let uri = return_if_err!(eval_uri(uri), ctx);

// Validate the URI is an S3 URI, with a bucket name. We rely on validating here
// and expect it in clusterd.
//
// TODO(cf2): Structured errors.
let result = http::Uri::from_str(&uri)
.map_err(|err| {
AdapterError::Unstructured(anyhow::anyhow!("expected S3 uri: {err}"))
})
.and_then(|uri| {
if uri.scheme_str() != Some("s3") {
coord_bail!("only 's3://...' urls are supported as COPY FROM target");
}
Ok(uri)
})
.and_then(|uri| {
if uri.host().is_none() {
coord_bail!("missing bucket name from 's3://...' url");
}
Ok(uri)
});
let uri = return_if_err!(result, ctx);

mz_storage_types::oneshot_sources::ContentSource::AwsS3 {
connection,
connection_id,
uri: uri.to_string(),
}
}
CopyFromSource::Stdin => {
unreachable!("COPY FROM STDIN should be handled elsewhere")
}
};

let filter = match filter {
None => mz_storage_types::oneshot_sources::ContentFilter::None,
Some(CopyFromFilter::Files(files)) => {
mz_storage_types::oneshot_sources::ContentFilter::Files(files)
}
Some(CopyFromFilter::Pattern(pattern)) => {
mz_storage_types::oneshot_sources::ContentFilter::Pattern(pattern)
}
};

let request = OneshotIngestionRequest {
source: mz_storage_types::oneshot_sources::ContentSource::Http { url },
format: mz_storage_types::oneshot_sources::ContentFormat::Csv,
source,
format,
filter,
};

let target_cluster = match self
Expand Down
9 changes: 7 additions & 2 deletions src/aws-util/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,20 @@ workspace = true
[dependencies]
anyhow = "1.0.66"
aws-config = { version = "1.2.0", default-features = false }
aws-sdk-s3 = { version = "1.23.0", default-features = false, features = ["rt-tokio"], optional = true }
aws-sdk-s3 = { version = "1.23.0", default-features = false, features = [
"rt-tokio",
], optional = true }
aws-smithy-runtime-api = "1.1.1"
aws-smithy-runtime = { version = "1.1.1", features = ["connector-hyper-0-14-x"] }
aws-smithy-types = { version = "1.1.8", features = ["byte-stream-poll-next"] }
aws-types = "1.1.1"
bytes = "1.3.0"
bytesize = "1.1.0"
futures = "0.3.25"
http = "1.1.0"
hyper-tls = "0.5.0"
mz-ore = { path = "../ore", default-features = false }
mz-ore = { path = "../ore", features = ["async"], default-features = false }
pin-project = "1.0.12"
thiserror = "1.0.37"
tokio = { version = "1.38.0", default-features = false, features = ["macros"] }
uuid = { version = "1.7.0", features = ["v4"] }
Expand Down
Loading
Loading