Source code for getml.database.read_s3

# Copyright 2022 The SQLNet Company GmbH
#
# This file is licensed under the Elastic License 2.0 (ELv2).
# Refer to the LICENSE.txt file in the root of the repository
# for details.
#


"""
Reads a list of CSV files located in an S3 bucket.
"""

from typing import Any, Dict, List, Optional

import getml.communication as comm

from .connection import Connection


[docs]def read_s3( name: str, bucket: str, keys: List[str], region: str, sep: str = ",", num_lines_read: int = 0, skip: int = 0, colnames: Optional[List[str]] = None, conn: Optional[Connection] = None, ): """ Reads a list of CSV files located in an S3 bucket. Example: Let's assume you have two CSV files - *file1.csv* and *file2.csv* - in the bucket. You can import their data into the getML engine using the following commands: >>> getml.engine.set_s3_access_key_id("YOUR-ACCESS-KEY-ID") >>> >>> getml.engine.set_s3_secret_access_key("YOUR-SECRET-ACCESS-KEY") >>> >>> stmt = data.database.sniff_s3( ... bucket="your-bucket-name", ... keys=["file1.csv", "file2.csv"], ... region="us-east-2", ... name="MY_TABLE", ... sep=';' ... ) >>> >>> getml.database.execute(stmt) >>> >>> stmt = data.database.read_s3( ... bucket="your-bucket-name", ... keys=["file1.csv", "file2.csv"], ... region="us-east-2", ... name="MY_TABLE", ... sep=';' ... ) You can also set the access credential as environment variables before you launch the getML engine. Args: name (str): Name of the table in which the data is to be inserted. bucket (str): The bucket from which to read the files. keys (List[str]): The list of keys (files in the bucket) to be read. region (str): The region in which the bucket is located. sep (str, optional): The separator used for separating fields. Default:`,` num_lines_read (int, optional): Number of lines read from each file. Set to 0 to read in the entire file. skip (int, optional): Number of lines to skip at the beginning of each file (Default: 0). colnames(List[str] or None, optional): The first line of a CSV file usually contains the column names. When this is not the case, you need to explicitly pass them. conn (:class:`~getml.database.Connection`, optional): The database connection to be used. If you don't explicitly pass a connection, the engine will use the default connection. """ # ------------------------------------------- conn = conn or Connection() # ------------------------------------------- cmd: Dict[str, Any] = {} cmd["name_"] = name cmd["type_"] = "Database.read_s3" cmd["bucket_"] = bucket cmd["keys_"] = keys cmd["num_lines_read_"] = num_lines_read cmd["region_"] = region cmd["sep_"] = sep cmd["skip_"] = skip cmd["conn_id_"] = conn.conn_id if colnames is not None: cmd["colnames_"] = colnames # ------------------------------------------- comm.send(cmd)