How to read a CSV file from an S3 bucket using pandas in Python (tags: pandas)

How to read a csv file from an s3 bucket using Pandas in Python


Using pandas 0.20.3

import os
import sys

import boto3
import pandas as pd

# StringIO is imported from a different module depending on the major
# Python version.
if sys.version_info[0] >= 3:
    from io import StringIO  # Python 3.x
else:
    from StringIO import StringIO  # Python 2.x

# Location of the CSV object inside S3.
bucket_name = 'my_bucket'
object_key = 'my_file.csv'

# Credentials are taken from environment variables; a missing variable
# raises KeyError here, before any request is made.
aws_id = os.environ['AWS_ID']
aws_secret = os.environ['AWS_SECRET']

client = boto3.client(
    's3',
    aws_access_key_id=aws_id,
    aws_secret_access_key=aws_secret,
)

# Fetch the object, read its whole body and decode it as UTF-8 text.
csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')

# Wrap the decoded text in a file-like object so pandas can parse it.
df = pd.read_csv(StringIO(csv_string))


Based on this answer that suggested using smart_open for reading from S3, this is how I used it with Pandas:

import os

import pandas as pd
from smart_open import smart_open

# Credentials are taken from environment variables; a missing variable
# raises KeyError here.
aws_key = os.environ['AWS_ACCESS_KEY']
aws_secret = os.environ['AWS_SECRET_ACCESS_KEY']

bucket_name = 'my_bucket'
object_key = 'my_file.csv'

# The credentials are embedded directly in the S3 URL handed to smart_open.
# NOTE(review): this puts the secret into a plain string - avoid logging
# or printing `path`.
path = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, object_key)

# Open the stream in a `with` block so it is closed as soon as pandas has
# finished parsing, instead of being left open until garbage collection.
with smart_open(path) as s3_file:
    df = pd.read_csv(s3_file)


You don't need pandas; you can just use Python's built-in csv module.

def read_file(bucket_name, region, remote_file_name, aws_access_key_id,
              aws_secret_access_key, encoding='utf-8'):
    """Download a CSV object from S3 and return its data rows.

    Connects to S3 with ``boto.s3.connect_to_region``, fetches the object
    named ``remote_file_name`` from ``bucket_name``, parses it with the
    ``csv`` module and discards the first (header) row.

    The names ``boto``, ``Key`` and ``csv`` are expected to be imported by
    the surrounding module (``Key`` is presumably ``boto.s3.key.Key``).

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        region: Region name passed to ``boto.s3.connect_to_region``.
        remote_file_name: Key (path) of the CSV object inside the bucket.
        aws_access_key_id: AWS access key id used for the connection.
        aws_secret_access_key: AWS secret access key used for the connection.
        encoding: Text encoding used when the download is returned as
            bytes. Defaults to ``'utf-8'``.

    Returns:
        A list of rows, each a list of strings, excluding the header row.
        An empty list is returned when the object is empty or contains
        only a header.
    """
    import io  # local import: only this function needs it

    # First establish the connection with the credentials and region.
    conn = boto.s3.connect_to_region(
        region,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key)

    # Next build the key of the CSV to read from the bucket and file name.
    # NOTE(review): validate=False presumably skips boto's bucket-existence
    # check - confirm against the boto documentation.
    bucket = conn.get_bucket(bucket_name, validate=False)
    key = Key(bucket)
    key.key = remote_file_name
    try:
        payload = key.get_contents_as_string()
    finally:
        # Close the key even when the download fails.
        key.close()

    # The download may arrive as bytes; the csv module needs text.
    if isinstance(payload, bytes):
        payload = payload.decode(encoding)

    # Let the csv module find the line boundaries itself rather than
    # splitting on a hard-coded '\r\n': this handles '\n'-terminated files
    # and quoted fields with embedded newlines, and no longer yields a
    # spurious empty row after the final line terminator.
    reader = csv.reader(io.StringIO(payload, newline=''))

    # Skip the header; the default keeps an empty object from raising
    # StopIteration.
    next(reader, None)
    return list(reader)

Hope this solves your problem. Good luck! :)