-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathconfig.yaml.example
284 lines (274 loc) · 12 KB
/
config.yaml.example
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# Example configuration for migrating from Cassandra:
source:
  type: cassandra
  host: cassandra-server-01
  port: 9042
  # Optional - if not specified, None will be used.
  # localDC: <localdc>
  # credentials:
  #   username: <user>
  #   password: <pass>
  # SSL as per https://github.com/scylladb/spark-cassandra-connector/blob/master/doc/reference.md#cassandra-ssl-connection-options
  # sslOptions:
  #   clientAuthEnabled: false
  #   enabled: false
  #   # All options below are optional (generally just trustStorePassword and trustStorePath are needed).
  #   trustStorePassword: <pass>
  #   trustStorePath: <path>
  #   trustStoreType: JKS
  #   keyStorePassword: <pass>
  #   keyStorePath: <path>
  #   keyStoreType: JKS
  #   enabledAlgorithms:
  #     - TLS_RSA_WITH_AES_128_CBC_SHA
  #     - TLS_RSA_WITH_AES_256_CBC_SHA
  #   protocol: TLS
  keyspace: stocks
  table: stocks
  # Consistency Level for the source connection.
  # Options are: LOCAL_ONE, ONE, LOCAL_QUORUM, QUORUM.
  # Connector driver default is LOCAL_ONE. Our recommendation is LOCAL_QUORUM.
  # If using ONE or LOCAL_ONE, ensure the source system is fully repaired.
  consistencyLevel: LOCAL_QUORUM
  # Preserve TTLs and WRITETIMEs of cells in the source database. Note that this
  # option is *incompatible* with copying tables that contain collections (lists, maps, sets).
  preserveTimestamps: true
  # Number of splits to use - this should be at minimum the number of cores
  # available in the Spark cluster, and optimally more; higher splits will lead
  # to more fine-grained resumes. Aim for 8 * (Spark cores).
  splitCount: 256
  # Number of connections to use to Cassandra when copying
  connections: 8
  # Number of rows to fetch in each read
  fetchSize: 1000
  # Optional condition to filter source table data that will be migrated
  # where: race_start_date = '2015-05-27' AND race_end_date = '2015-05-27'
# Example for loading from Parquet:
# source:
#   type: parquet
#   path: s3a://bucket-name/path/to/parquet-directory
#   # Optional AWS access/secret key for loading from S3.
#   # This section can be left out if running on EC2 instances that have instance profiles with the
#   # appropriate permissions.
#   credentials:
#     accessKey: <user>
#     secretKey: <pass>
#     # Optional - assume role, see https://docs.aws.amazon.com/IAM/latest/UserGuide/tutorial_cross-account-with-roles.html
#     assumeRole:
#       arn: <roleArn>
#       # Optional - the session name to use. If not set, we use 'scylla-migrator'
#       sessionName: <roleSessionName>
#   # Optional - specify the region:
#   region: <region>
#   # Optional - specify the endpoint
#   endpoint:
#     # Specify the hostname without a protocol
#     host: <host>
#     port: <port>
# Example for loading from DynamoDB:
# source:
#   type: dynamodb
#   table: <table name>
#   # Optional - load from a custom endpoint:
#   endpoint:
#     # Specify the hostname without a protocol
#     host: <host>
#     port: <port>
#
#   # Optional - specify the region:
#   # region: <region>
#
#   # Optional - static credentials:
#   credentials:
#     accessKey: <user>
#     secretKey: <pass>
#     # Optional - assume role, see https://docs.aws.amazon.com/IAM/latest/UserGuide/tutorial_cross-account-with-roles.html
#     assumeRole:
#       arn: <roleArn>
#       # Optional - the session name to use. If not set, we use 'scylla-migrator'
#       sessionName: <sessionName>
#
#   # below controls split factor
#   scanSegments: 1
#
#   # throttling settings, set based on your capacity (or wanted capacity)
#   readThroughput: 1
#
#   # The value of dynamodb.throughput.read.percent can be between 0.1 and 1.5, inclusively.
#   # 0.5 represents the default read rate, meaning that the job will attempt to consume half of the read capacity of the table.
#   # If you increase the value above 0.5, spark will increase the request rate; decreasing the value below 0.5 decreases the read request rate.
#   # (The actual read rate will vary, depending on factors such as whether there is a uniform key distribution in the DynamoDB table.)
#   throughputReadPercent: 1.0
#
#   # how many tasks per executor?
#   maxMapTasks: 1
# Example for loading from a DynamoDB S3 export (see https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/S3DataExport.Output.html)
# source:
#   type: dynamodb-s3-export
#   bucket: <bucket-name>
#   # Key of the `manifest-summary.json` object in the bucket
#   manifestKey: <manifest-summary-key>
#   # Key schema and attribute definitions, see https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_TableCreationParameters.html
#   tableDescription:
#     attributeDefinitions:
#       - name: <attribute-name>
#         type: <attribute-type> (see https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_AttributeDefinition.html)
#       - ...
#     keySchema:
#       - name: <key-name>
#         type: <key-type> (see https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_KeySchemaElement.html)
#       - ...
#
#   # Optional - load from a custom endpoint:
#   endpoint:
#     # Specify the hostname without a protocol
#     host: <host>
#     port: <port>
#   # Optional - specify the region:
#   region: <region>
#
#   # Optional - static credentials:
#   credentials:
#     accessKey: <user>
#     secretKey: <pass>
#     # Optional - assume role, see https://docs.aws.amazon.com/IAM/latest/UserGuide/tutorial_cross-account-with-roles.html
#     assumeRole:
#       arn: <roleArn>
#       # Optional - the session name to use. If not set, we use 'scylla-migrator'
#       sessionName: <roleSessionName>
#
#   # Optional - use path-style access in S3 (see https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html)
#   usePathStyleAccess: true
# Configuration for the database you're copying into
target:
  type: scylla
  host: scylla
  port: 9042
  # Optional - if not specified, None will be used.
  # localDC: <localdc>
  # credentials:
  #   username: <user>
  #   password: <pass>
  # SSL as per https://github.com/scylladb/spark-cassandra-connector/blob/master/doc/reference.md#cassandra-ssl-connection-options
  # sslOptions:
  #   clientAuthEnabled: false
  #   enabled: false
  #   # All options below are optional (generally just trustStorePassword and trustStorePath are needed).
  #   trustStorePassword: <pass>
  #   trustStorePath: <path>
  #   trustStoreType: JKS
  #   keyStorePassword: <pass>
  #   keyStorePath: <path>
  #   keyStoreType: JKS
  #   enabledAlgorithms:
  #     - TLS_RSA_WITH_AES_128_CBC_SHA
  #     - TLS_RSA_WITH_AES_256_CBC_SHA
  #   protocol: TLS
  # NOTE: The destination table must have the same schema as the source table.
  # If you'd like to rename columns, that's ok - see the renames parameter below.
  keyspace: stocks
  table: stocks
  # Consistency Level for the target connection.
  # Options are: LOCAL_ONE, ONE, LOCAL_QUORUM, QUORUM.
  # Connector driver default is LOCAL_QUORUM.
  consistencyLevel: LOCAL_QUORUM
  # Number of connections to use to Scylla when copying
  connections: 16
  # Spark pads decimals with zeros appropriate to their scale. This causes values
  # like '3.5' to be copied as '3.5000000000...' to the target. There's no good way
  # currently to preserve the original value, so this flag can strip trailing zeros
  # on decimal values before they are written.
  stripTrailingZerosForDecimals: false
  # If timestamps cannot be preserved (i.e. preserveTimestamps is false), the writer
  # can enforce a single TTL and/or write timestamp for ALL written records.
  # Such a write timestamp can, for example, be set to a time BEFORE dual writes were
  # started; this keeps the migration from overwriting dual-written data,
  # even for collections.
  # ALL rows written will get the same TTL or write timestamp, or both
  # (you can uncomment just one of them, both, or neither).
  # TTL in seconds (sample 7776000 is 90 days)
  # writeTTLInS: 7776000
  # Write timestamp in microseconds
  # (sample 1640998861000000 is Saturday, January 1, 2022 2:01:01 AM GMT+01:00)
  # writeWritetimestampInuS: 1640998861000000
# Example for loading into a DynamoDB target (for example, Scylla's Alternator):
# target:
#   type: dynamodb
#   table: <table name>
#   # Optional - write to a custom endpoint:
#   endpoint:
#     # If writing to Scylla Alternator, prefix the hostname with 'http://'.
#     host: <host>
#     port: <port>
#
#   # Optional - specify the region:
#   # region: <region>
#
#   # Optional - static credentials:
#   credentials:
#     accessKey: <user>
#     secretKey: <pass>
#     # Optional - assume role, see https://docs.aws.amazon.com/IAM/latest/UserGuide/tutorial_cross-account-with-roles.html
#     assumeRole:
#       arn: <roleArn>
#       # Optional - the session name to use. If not set, we use 'scylla-migrator'
#       sessionName: <roleSessionName>
#
#   # throttling settings, set based on your write capacity units (or wanted capacity)
#   writeThroughput: 1
#
#   # The value of dynamodb.throughput.write.percent can be between 0.1 and 1.5, inclusively.
#   # 0.5 represents the default write rate, meaning that the job will attempt to consume half of the write capacity of the table.
#   # If you increase the value above 0.5, spark will increase the request rate; decreasing the value below 0.5 decreases the write request rate.
#   # (The actual write rate will vary, depending on factors such as whether there is a uniform key distribution in the DynamoDB table.)
#   throughputWritePercent: 1.0
#
#   # When transferring DynamoDB sources to DynamoDB targets (such as other DynamoDB tables or Alternator tables),
#   # the migrator supports transferring live changes occurring on the source table after transferring an initial
#   # snapshot. This is done using DynamoDB streams and incurs additional charges due to the Kinesis streams created.
#   # Enable this flag to transfer live changes after transferring an initial snapshot. The migrator will continue
#   # replicating changes endlessly; it must be stopped manually.
#   #
#   # NOTE: For the migration to be performed losslessly, the initial snapshot transfer must complete within 24 hours.
#   # Otherwise, some captured changes may be lost due to the retention period of the table's stream.
#   #
#   # NOTE2: The migrator does not currently delete the created Dynamo stream. Delete it manually after ending the
#   # migrator run.
#   streamChanges: false
#
#   # Optional - when streamChanges is true, skip the initial snapshot transfer and only stream changes.
#   # This setting is ignored if streamChanges is false.
#   # skipInitialSnapshotTransfer: false
# Savepoints are configuration files (like this one), saved by the migrator as it
# runs. Their purpose is to skip token ranges that have already been copied. This
# configuration only applies when copying from Cassandra/Scylla.
savepoints:
  # Where should savepoint configurations be stored? This is a path on the host running
  # the Spark driver - usually the Spark master.
  path: /app/savepoints
  # Interval (in seconds) at which savepoints will be created
  intervalSeconds: 300
# Optional - Column renaming configuration. If you'd like to rename any columns, specify them like so:
#   - from: source_column_name
#     to: dest_column_name
# renames: []
# Optional - Which token ranges to skip. You shouldn't need to fill this in normally; the migrator will
# create a savepoint file with this filled.
# skipTokenRanges: []
# Optional - Which scan segments to skip. You shouldn't need to fill this in normally; the migrator will
# create a savepoint file with this filled.
# skipSegments: []
# Configuration section for running the validator. The validator is run manually (see documentation).
# Mandatory if the application is executed in validation mode.
# validation:
#   # Should WRITETIMEs and TTLs be compared?
#   compareTimestamps: true
#   # What difference should we allow between TTLs?
#   ttlToleranceMillis: 60000
#   # What difference should we allow between WRITETIMEs?
#   writetimeToleranceMillis: 1000
#   # How many differences to fetch and print
#   failuresToFetch: 100
#   # What difference should we allow between floating point numbers?
#   floatingPointTolerance: 0.001
#   # What difference in ms should we allow between timestamps?
#   timestampMsTolerance: 0