@@ -4241,16 +4241,22 @@ def setUp(self):
42414241
42424242 @tm .network
42434243 def test_parse_public_s3_bucket (self ):
4244- import nose .tools as nt
4245- df = pd .read_csv ('s3://nyqpug/tips.csv' )
4246- nt .assert_true (isinstance (df , pd .DataFrame ))
4247- nt .assert_false (df .empty )
4248- tm .assert_frame_equal (pd .read_csv (tm .get_data_path ('tips.csv' )), df )
4244+ for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
4245+ if comp == 'bz2' and compat .PY2 :
4246+ # The Python 2 C parser can't read bz2 from S3.
4247+ self .assertRaises (ValueError , pd .read_csv ,
4248+ 's3://pandas-test/tips.csv' + ext ,
4249+ compression = comp )
4250+ else :
4251+ df = pd .read_csv ('s3://pandas-test/tips.csv' + ext , compression = comp )
4252+ self .assertTrue (isinstance (df , pd .DataFrame ))
4253+ self .assertFalse (df .empty )
4254+ tm .assert_frame_equal (pd .read_csv (tm .get_data_path ('tips.csv' )), df )
42494255
42504256 # Read public file from bucket with not-public contents
42514257 df = pd .read_csv ('s3://cant_get_it/tips.csv' )
4252- nt . assert_true (isinstance (df , pd .DataFrame ))
4253- nt . assert_false (df .empty )
4258+ self . assertTrue (isinstance (df , pd .DataFrame ))
4259+ self . assertFalse (df .empty )
42544260 tm .assert_frame_equal (pd .read_csv (tm .get_data_path ('tips.csv' )), df )
42554261
42564262 @tm .network
@@ -4269,6 +4275,81 @@ def test_parse_public_s3a_bucket(self):
42694275 self .assertFalse (df .empty )
42704276 tm .assert_frame_equal (pd .read_csv (tm .get_data_path ('tips.csv' )).iloc [:10 ], df )
42714277
4278+ @tm .network
4279+ def test_parse_public_s3_bucket_nrows (self ):
4280+ for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
4281+ if comp == 'bz2' and compat .PY2 :
4282+ # The Python 2 C parser can't read bz2 from S3.
4283+ self .assertRaises (ValueError , pd .read_csv ,
4284+ 's3://pandas-test/tips.csv' + ext ,
4285+ compression = comp )
4286+ else :
4287+ df = pd .read_csv ('s3://pandas-test/tips.csv' + ext , nrows = 10 , compression = comp )
4288+ self .assertTrue (isinstance (df , pd .DataFrame ))
4289+ self .assertFalse (df .empty )
4290+ tm .assert_frame_equal (pd .read_csv (tm .get_data_path ('tips.csv' )).iloc [:10 ], df )
4291+
4292+ @tm .network
4293+ def test_parse_public_s3_bucket_chunked (self ):
4294+ # Read with a chunksize
4295+ chunksize = 5
4296+ local_tips = pd .read_csv (tm .get_data_path ('tips.csv' ))
4297+ for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
4298+ if comp == 'bz2' and compat .PY2 :
4299+ # The Python 2 C parser can't read bz2 from S3.
4300+ self .assertRaises (ValueError , pd .read_csv ,
4301+ 's3://pandas-test/tips.csv' + ext ,
4302+ compression = comp )
4303+ else :
4304+ df_reader = pd .read_csv ('s3://pandas-test/tips.csv' + ext ,
4305+ chunksize = chunksize , compression = comp )
4306+ self .assertEqual (df_reader .chunksize , chunksize )
4307+ for i_chunk in [0 , 1 , 2 ]:
4308+ # Read a couple of chunks and make sure we see them properly.
4309+ df = df_reader .get_chunk ()
4310+ self .assertTrue (isinstance (df , pd .DataFrame ))
4311+ self .assertFalse (df .empty )
4312+ true_df = local_tips .iloc [chunksize * i_chunk : chunksize * (i_chunk + 1 )]
4313+ true_df = true_df .reset_index ().drop ('index' , axis = 1 ) # Chunking doesn't preserve row numbering
4314+ tm .assert_frame_equal (true_df , df )
4315+
4316+ @tm .network
4317+ def test_parse_public_s3_bucket_chunked_python (self ):
4318+ # Read with a chunksize using the Python parser
4319+ chunksize = 5
4320+ local_tips = pd .read_csv (tm .get_data_path ('tips.csv' ))
4321+ for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
4322+ df_reader = pd .read_csv ('s3://pandas-test/tips.csv' + ext ,
4323+ chunksize = chunksize , compression = comp ,
4324+ engine = 'python' )
4325+ self .assertEqual (df_reader .chunksize , chunksize )
4326+ for i_chunk in [0 , 1 , 2 ]:
4327+ # Read a couple of chunks and make sure we see them properly.
4328+ df = df_reader .get_chunk ()
4329+ self .assertTrue (isinstance (df , pd .DataFrame ))
4330+ self .assertFalse (df .empty )
4331+ true_df = local_tips .iloc [chunksize * i_chunk : chunksize * (i_chunk + 1 )]
4332+ true_df = true_df .reset_index ().drop ('index' , axis = 1 ) # Chunking doesn't preserve row numbering
4333+ tm .assert_frame_equal (true_df , df )
4334+
4335+ @tm .network
4336+ def test_parse_public_s3_bucket_python (self ):
4337+ for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
4338+ df = pd .read_csv ('s3://pandas-test/tips.csv' + ext , engine = 'python' ,
4339+ compression = comp )
4340+ self .assertTrue (isinstance (df , pd .DataFrame ))
4341+ self .assertFalse (df .empty )
4342+ tm .assert_frame_equal (pd .read_csv (tm .get_data_path ('tips.csv' )), df )
4343+
4344+ @tm .network
4345+ def test_parse_public_s3_bucket_nrows_python (self ):
4346+ for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
4347+ df = pd .read_csv ('s3://pandas-test/tips.csv' + ext , engine = 'python' ,
4348+ nrows = 10 , compression = comp )
4349+ self .assertTrue (isinstance (df , pd .DataFrame ))
4350+ self .assertFalse (df .empty )
4351+ tm .assert_frame_equal (pd .read_csv (tm .get_data_path ('tips.csv' )).iloc [:10 ], df )
4352+
42724353 @tm .network
42734354 def test_s3_fails (self ):
42744355 import boto
0 commit comments