How to use parquet file downloads
The following are examples only, and may need to be adapted to your specific use case
Contents
- Download and extract dataset
- Initial Python setup
- Load dataset information
- Load configurations
- Load configurations with configuration sets
- Links to Parquet file schemas
Download and extract dataset
- Download desired dataset file using Download Dataset Parquet Files button on main search results page or dataset details page.
- This will download a tar.gz file in the format
<dataset_id>.tar.gz.
- Using your command line, navigate to the containing directory and execute
tar -xf <dataset_id>.tar.gz.
- A directory called
<dataset_id>
will be extracted (e.g. DS_a1bc2z345).
- This directory will contain these non-optional items:
<dataset_id>/co/
— A directory containing one or more parquet files of configurations, formatted as co_0.parquet, co_1.parquet, ...
<dataset_id>/ds.parquet
— A file containing dataset details and aggregated information.
- and may contain these optional items:
<dataset_id>/cs/
— A directory containing one or more parquet files of configuration sets, formatted as cs_0.parquet, cs_1.parquet, ...
<dataset_id>/cs_co_map/
— A directory containing one or more parquet files of configuration set to configuration mapping, formatted as cs_co_map_0.parquet, cs_co_map_1.parquet, ...
See the file schema pages for a full list of column definitions.
Initial Python setup
from pathlib import Path

# Root of the extracted dataset archive (replace <dataset_id> with the real id).
dataset_dir = Path('path/to/dataset/directory/<dataset_id>')

# Required: dataset-level details and aggregated information.
dataset_info_file = dataset_dir.joinpath('ds.parquet')
# Required: directory of configuration parquet files (co_0.parquet, ...).
configuration_dir = dataset_dir.joinpath('co')
# Optional: configuration sets directory.
configuration_set_dir = dataset_dir.joinpath('cs')  # If present
# Optional: configuration-set-to-configuration mapping directory.
config_set_map_dir = dataset_dir.joinpath('cs_co_map')  # If present
Load dataset information
import pandas as pd

# Option 1: pandas — load the dataset details as a one-row DataFrame.
dataset_info = pd.read_parquet(dataset_info_file)
print(dataset_info.head())

import pyarrow.parquet as pq

# Option 2: pyarrow — load the dataset details as a pyarrow Table.
dataset_info = pq.read_table(dataset_info_file)
# Bug fix: Table.to_pydict() returns a dict of columns, so indexing it with
# [0] raises a KeyError. to_pylist() returns a list of row dicts, whose first
# element is the (single) dataset-details row.
print(dataset_info.to_pylist()[0])
Load configurations
Configurations are the data points of the dataset. Each configuration row contains a structure and its associated calculated properties.
def pd_row_to_atoms(row):
    """Convert one pd.DataFrame row into an ase.Atoms object.

    Structure columns become Atoms constructor arguments; every column named
    in the (module-level) ``property_cols`` list is copied into ``atoms.info``.
    """
    properties = {col: row[col] for col in property_cols}
    return Atoms(
        positions=row['positions'].tolist(),
        numbers=row['atomic_numbers'],
        cell=row['cell'].tolist(),
        pbc=row['pbc'],
        info=properties,
    )
def load_configurations_to_atoms(configuration_dir):
    """Read all configuration parquet files and map each row to an ase.Atoms."""
    frame = pd.read_parquet(configuration_dir)
    # Restrict to the structure + property columns before converting rows.
    wanted_cols = minimum_structure_cols + property_cols
    return frame[wanted_cols].apply(pd_row_to_atoms, axis=1)
configurations = pd.read_parquet(configuration_dir)
# Accessible by positional row index via .iloc.
# Bug fix: configurations[0] would look up a *column* labeled 0 (pandas
# [] indexing on a DataFrame selects columns) and raise a KeyError.
first_config = configurations.iloc[0]
from ase.atoms import Atoms
def load_configurations_to_atoms(configuration_dir):
    """Lazily yield one ase.Atoms per configuration row (pyarrow version)."""
    table = pq.read_table(configuration_dir)
    # Keep only the structure + property columns.
    table = table.select(minimum_structure_cols + property_cols)
    for batch in table.to_batches():
        columns = batch.to_pydict()
        for row in range(batch.num_rows):
            yield Atoms(
                positions=columns['positions'][row],
                numbers=columns['atomic_numbers'][row],
                cell=columns['cell'][row],
                pbc=columns['pbc'][row],
                info={key: columns[key][row] for key in property_cols},
            )
configurations = load_configurations_to_atoms(configuration_dir)

# Consume the generator one configuration at a time.
for configuration in configurations:
    pass

# Or consume it in fixed-size chunks (itertools.batched requires Python 3.12+).
from itertools import batched

batch_size = 10_000
co_batches = batched(configurations, batch_size)
for chunk in co_batches:
    pass
Load configurations with configuration sets
def pd_row_to_atoms_with_cs_id(row):
    """Convert pd.DataFrame row to ase.Atoms object with configuration_set_id in atoms.info dict"""
    # The configuration set id travels alongside the property columns in info.
    info_cols = property_cols + ['configuration_set_id']
    return Atoms(
        positions=row['positions'].tolist(),
        numbers=row['atomic_numbers'],
        cell=row['cell'].tolist(),
        pbc=row['pbc'],
        info={col: row[col] for col in info_cols},
    )
def select_one_configuration_set(configuration_dir, config_set_map_dir, configuration_set_id):
    """Select all configurations from one configuration set and convert to ase.Atoms"""
    config_df = pd.read_parquet(configuration_dir)
    cs_map = pd.read_parquet(config_set_map_dir)
    # Keep only the mapping rows belonging to the requested configuration set.
    cs_map = cs_map[cs_map['configuration_set_id'] == configuration_set_id]
    # Right merge: one output row per surviving mapping entry.
    selected = config_df.merge(cs_map, on='configuration_id', how='right')
    return selected.apply(pd_row_to_atoms_with_cs_id, axis=1)

configurations = select_one_configuration_set(configuration_dir, config_set_map_dir, 'CS_0x40mfbhvhiu_0')
def load_configurations_to_atoms(configuration_dir, config_set_map_dir):
    """Select configurations from all configuration sets and convert to ase.Atoms with configuration_set_id in atoms.info dict"""
    configs = pd.read_parquet(configuration_dir)
    cs_map = pd.read_parquet(config_set_map_dir)
    # Right merge keeps every mapping row, attaching its structure/property data.
    merged = configs.merge(cs_map, on='configuration_id', how='right')
    return merged.apply(pd_row_to_atoms_with_cs_id, axis=1)

configurations = load_configurations_to_atoms(configuration_dir, config_set_map_dir)
def load_configurations_to_atoms(configuration_dir, config_set_map_dir):
    """Select configurations from all configuration sets and lazily yield them
    as ase.Atoms with configuration_set_id in the atoms.info dict (pyarrow version).
    """
    import pyarrow as pa  # needed for pa.array below

    co_table = pq.read_table(configuration_dir)
    cs_co_map_table = pq.read_table(config_set_map_dir)
    # Bug fix: the mapping table must be converted to a dict of columns before
    # building the configuration_id -> configuration_set_id lookup
    # (cs_co_map_dict was previously referenced without being defined).
    cs_co_map_dict = cs_co_map_table.to_pydict()
    cs_co_map = dict(zip(cs_co_map_dict['configuration_id'], cs_co_map_dict['configuration_set_id']))
    # Configurations absent from the map get None as their configuration_set_id.
    cs_col = pa.array([cs_co_map.get(co_id) for co_id in co_table['configuration_id'].to_pylist()])
    co_table = co_table.append_column('configuration_set_id', cs_col)
    co_table = co_table.select(minimum_structure_cols + property_cols + ['configuration_set_id'])
    for co_batch in co_table.to_batches():
        co_dict = co_batch.to_pydict()
        for i in range(co_batch.num_rows):
            info = {key: co_dict[key][i] for key in property_cols + ['configuration_set_id']}
            atoms = Atoms(
                positions=co_dict['positions'][i],
                numbers=co_dict['atomic_numbers'][i],
                cell=co_dict['cell'][i],
                pbc=co_dict['pbc'][i],
                info=info
            )
            yield atoms

configurations = load_configurations_to_atoms(configuration_dir, config_set_map_dir)