This should be reproducible using the latest pyarrow release, 0.11.1. (I’m using a version compiled from the latest source, which fixes an unrelated pandas metadata issue.)
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
# Current naive UTC time, truncated to whole milliseconds: the sub-millisecond
# part of the microsecond field is dropped. The value is stored below in the
# 'vv' column, which the schema declares as timestamp('ms').
modified = datetime.utcnow()
modified = modified.replace(microsecond=modified.microsecond // 1000 * 1000)
# One sample value per column; each value is repeated to fill a column when
# the Arrow arrays are built further down.
SAMPLE_INT = 100
SAMPLE_TEXT = 'hello'
SAMPLE_FLOAT = 100.01000213623047
SAMPLE_DATE = datetime(2018, 12, 25).date()

good_columns = {
    'a': SAMPLE_INT,
    'b': SAMPLE_INT,
    'c': SAMPLE_TEXT,
    'd': SAMPLE_TEXT,
    'e': SAMPLE_TEXT,
    'my_date': SAMPLE_DATE,
    'f': SAMPLE_FLOAT,
    'g': SAMPLE_TEXT,
    'h': SAMPLE_TEXT,
    'i': SAMPLE_DATE,
    'j': SAMPLE_FLOAT,
    'k': SAMPLE_TEXT,
    'l': SAMPLE_FLOAT,
    'm': SAMPLE_DATE,
    'n': SAMPLE_TEXT,
    'o': SAMPLE_TEXT,
    'p': SAMPLE_TEXT,
    'q': SAMPLE_TEXT,
    'r': SAMPLE_TEXT,
    's': SAMPLE_DATE,
    't': SAMPLE_FLOAT,
    'u': SAMPLE_TEXT,
    'v': SAMPLE_TEXT,
    'w': True,
    'x': SAMPLE_FLOAT,
    'y': SAMPLE_TEXT,
    'z': SAMPLE_FLOAT,
    'aa': SAMPLE_FLOAT,
    'bb': SAMPLE_FLOAT,
    'dd': SAMPLE_FLOAT,
    'ee': SAMPLE_FLOAT,
    'ff': SAMPLE_FLOAT,
    'gg': SAMPLE_FLOAT,
    'hh': False,
    'ii': SAMPLE_FLOAT,
    'jj': SAMPLE_TEXT,
    'kk': SAMPLE_TEXT,
    'll': SAMPLE_TEXT,
    'mm': SAMPLE_TEXT,
    'nn': SAMPLE_TEXT,
    'oo': SAMPLE_TEXT,
    'pp': SAMPLE_TEXT,
    'qq': SAMPLE_TEXT,
    'rr': SAMPLE_INT,
    'ss': SAMPLE_FLOAT,
    'tt': SAMPLE_TEXT,
    'uu': SAMPLE_TEXT,
    # Millisecond-truncated timestamp computed above.
    'vv': modified,
}
# Ordered (column name, Arrow type) pairs. The order of this list is the
# column order of the resulting schema.
column_types = [
    ('a', pa.int32()),
    ('b', pa.int32()),
    ('c', pa.string()),
    ('d', pa.string()),
    ('e', pa.string()),
    ('my_date', pa.date32()),
    ('f', pa.float32()),
    ('g', pa.string()),
    ('h', pa.string()),
    ('i', pa.date32()),
    ('j', pa.float32()),
    ('k', pa.string()),
    ('l', pa.float32()),
    ('m', pa.date32()),
    ('n', pa.string()),
    ('o', pa.string()),
    ('p', pa.string()),
    ('q', pa.string()),
    ('r', pa.string()),
    ('s', pa.date32()),
    ('t', pa.float32()),
    ('u', pa.string()),
    ('v', pa.string()),
    ('w', pa.bool_()),
    ('x', pa.float32()),
    ('y', pa.string()),
    ('z', pa.float32()),
    ('aa', pa.float32()),
    ('bb', pa.float32()),
    ('dd', pa.float32()),
    ('ee', pa.float32()),
    ('ff', pa.float32()),
    ('gg', pa.float32()),
    ('hh', pa.bool_()),
    ('ii', pa.float32()),
    ('jj', pa.string()),
    ('kk', pa.string()),
    ('ll', pa.string()),
    ('mm', pa.string()),
    ('nn', pa.string()),
    ('oo', pa.string()),
    ('pp', pa.string()),
    ('qq', pa.string()),
    ('rr', pa.int32()),
    ('ss', pa.float32()),
    ('tt', pa.string()),
    ('uu', pa.string()),
    ('vv', pa.timestamp('ms')),
]

# Wrap every pair in a pa.field and assemble the schema in list order.
schema = pa.schema([
    pa.field(col_name, col_type) for col_name, col_type in column_types
])
# "Good" table: every column holds its sample value repeated ten times, typed
# according to the schema. schema.names and schema.types are parallel lists,
# so zipping them pairs each column name with its declared type.
arrow_columns = [
    pa.array([good_columns[column_name]] * 10, type=column_type)
    for column_name, column_type in zip(schema.names, schema.types)
]
good_arrow_table = pa.Table.from_arrays(arrow_columns, names=schema.names)

# "Bad" table: identical to the good one except that the 'my_date' column is
# entirely null. The column position is looked up from the schema once rather
# than hard-coded, so it stays correct if the schema's column order changes.
my_date_index = schema.get_field_index('my_date')
bad_column = pa.array([None] * 10, type=schema.types[my_date_index])
# good_arrow_table was built before this in-place replacement, so it still
# refers to the original, fully populated 'my_date' array.
arrow_columns[my_date_index] = bad_column
bad_arrow_table = pa.Table.from_arrays(arrow_columns, names=schema.names)
# Each output file receives three tables, written in the listed order. The two
# files differ only in where the table with the all-null 'my_date' column
# sits: second in test_good.parquet, first in test_bad.parquet.
parquet_outputs = [
    ("test_good.parquet", [good_arrow_table, bad_arrow_table, good_arrow_table]),
    ("test_bad.parquet", [bad_arrow_table, good_arrow_table, good_arrow_table]),
]

for output_path, tables in parquet_outputs:
    writer = pq.ParquetWriter(output_path, schema=schema, use_dictionary=True, compression='snappy')
    try:
        for table in tables:
            writer.write_table(table)
    finally:
        # Always close the writer, even if a write fails, so the file handle
        # is not leaked.
        writer.close()