How to use parquet file downloads

How to use parquet file downloads

The following are examples only, and may need to be adapted to your specific use case

Contents


Download and extract

  • Download desired dataset file using Download Dataset Parquet Files button on main search results page or dataset details page.
  • This will download a tar.gz file in the format <dataset_id>.tar.gz.
  • Using your command line, navigate to the containing directory and execute tar -xf <dataset_id>.tar.gz.
  • A directory called <dataset_id> will be extracted (e.g. DS_a1bc2z345).
  • This directory will contain these non-optional items:
    • <dataset_id>/co/ — A directory of configuration files containing one or more parquet files, formatted as co_0.parquet, co_1.parquet, ...
    • <dataset_id>/ds.parquet — A file containing dataset details and aggregated information.
  • and may contain these optional items:
    • <dataset_id>/cs/ — A directory containing one or more parquet files of configuration sets, formatted as cs_0.parquet cs_1.parquet ...
    • <dataset_id>/cs_co_map/ — A directory containing one or more parquet files of configuration set to configuration mapping, formatted as cs_co_map_0.parquet cs_co_map_1.parquet ...

See the file schema pages for a full list of column definitions.


Initial Python setup

from pathlib import Path

# Root of the extracted download; replace <dataset_id> with the real id,
# e.g. 'path/to/dataset/directory/DS_a1bc2z345'.
# (Fix: the placeholder was HTML-escaped as &amp;lt;dataset_id&amp;gt; and would
# never match a real directory name.)
dataset_dir = Path('path/to/dataset/directory/<dataset_id>')

dataset_info_file = dataset_dir / 'ds.parquet'   # dataset details and aggregates
configuration_dir = dataset_dir / 'co'           # configuration parquet files
configuration_set_dir = dataset_dir / 'cs'  # If present
config_set_map_dir = dataset_dir / 'cs_co_map'  # If present

Load dataset information

import pandas as pd

# Read the dataset-details parquet into a pandas DataFrame and preview it.
dataset_info = pd.read_parquet(dataset_info_file)
print(dataset_info.head())
import pyarrow.parquet as pq

# Read the dataset-details parquet into a pyarrow Table.
dataset_info = pq.read_table(dataset_info_file)
# Fix: Table.to_pydict() returns a dict keyed by *column name*, so
# indexing it with [0] raises KeyError. to_pylist() returns a list of
# row dicts — [0] is the first row.
print(dataset_info.to_pylist()[0])

Load configurations

Configurations are the data points of the dataset. Each configuration row contains a structure and its associated calculated properties.

def pd_row_to_atoms(row):
    """Convert one pd.DataFrame row into an ase.Atoms object.

    Structure columns feed the Atoms constructor; every column named in
    the (module-level) ``property_cols`` list is carried along in the
    ``Atoms.info`` dict.
    """
    properties = {name: row[name] for name in property_cols}
    return Atoms(
        positions=row['positions'].tolist(),
        numbers=row['atomic_numbers'],
        cell=row['cell'].tolist(),
        pbc=row['pbc'],
        info=properties,
    )

def load_configurations_to_atoms(configuration_dir):
    """Read all configuration parquet files under *configuration_dir*
    and return a pd.Series of ase.Atoms objects (one per row)."""
    frame = pd.read_parquet(configuration_dir)
    wanted = minimum_structure_cols + property_cols
    return frame[wanted].apply(pd_row_to_atoms, axis=1)

configurations = pd.read_parquet(configuration_dir)

# Accessible by index
# Fix: plain [] indexing on a DataFrame selects a *column* by name, so
# configurations[0] raises KeyError. Positional row access uses .iloc.
first_config = configurations.iloc[0]
from ase.atoms import Atoms

def load_configurations_to_atoms(configuration_dir):
    """Lazily yield one ase.Atoms object per configuration row.

    Reads every parquet file under *configuration_dir* with pyarrow and
    walks the table batch by batch, so only one record batch of rows is
    converted to Python objects at a time.
    """
    table = pq.read_table(configuration_dir)
    table = table.select(minimum_structure_cols + property_cols)
    for batch in table.to_batches():
        columns = batch.to_pydict()
        for row_idx in range(batch.num_rows):
            extra = {key: columns[key][row_idx] for key in property_cols}
            yield Atoms(
                positions=columns['positions'][row_idx],
                numbers=columns['atomic_numbers'][row_idx],
                cell=columns['cell'][row_idx],
                pbc=columns['pbc'][row_idx],
                info=extra,
            )

# load_configurations_to_atoms is a generator function, so the result
# can only be iterated once; re-call it to iterate again.
configurations = load_configurations_to_atoms(configuration_dir)

# Iterate from generator
for config in configurations:
    pass

# Iterate in batches
# NOTE: itertools.batched requires Python 3.12+.
from itertools import batched

batch_size = 10_000
co_batches = batched(configurations, batch_size)
for co_batch in co_batches:
    pass

Load configurations with configuration sets

def pd_row_to_atoms_with_cs_id(row):
    """Convert pd.DataFrame row to ase.Atoms object with configuration_set_id in atoms.info dict"""
    info_keys = property_cols + ['configuration_set_id']
    return Atoms(
        positions=row['positions'].tolist(),
        numbers=row['atomic_numbers'],
        cell=row['cell'].tolist(),
        pbc=row['pbc'],
        info={key: row[key] for key in info_keys},
    )


def select_one_configuration_set(configuration_dir, config_set_map_dir, configuration_set_id):
    """Select all configurations from one configuration set and convert to ase.Atoms"""
    configs = pd.read_parquet(configuration_dir)
    cs_map = pd.read_parquet(config_set_map_dir)
    # Keep only map rows for the requested configuration set, then
    # right-merge so exactly those configurations survive.
    cs_map = cs_map.loc[cs_map['configuration_set_id'] == configuration_set_id]
    selected = configs.merge(cs_map, on='configuration_id', how='right')
    return selected.apply(pd_row_to_atoms_with_cs_id, axis=1)

# Example: pull only the configurations belonging to one configuration set.
configurations = select_one_configuration_set(configuration_dir, config_set_map_dir, 'CS_0x40mfbhvhiu_0')


def load_configurations_to_atoms(configuration_dir, config_set_map_dir):
    """Select configurations from all configuration sets and convert to ase.Atoms with configuration_set_id in atoms.info dict"""
    configs = pd.read_parquet(configuration_dir)
    cs_map = pd.read_parquet(config_set_map_dir)
    # Right-merge keeps every mapped configuration and attaches its
    # configuration_set_id column for pd_row_to_atoms_with_cs_id.
    mapped = configs.merge(cs_map, on='configuration_id', how='right')
    return mapped.apply(pd_row_to_atoms_with_cs_id, axis=1)


# Example: all configurations, each tagged with its configuration_set_id.
configurations = load_configurations_to_atoms(configuration_dir, config_set_map_dir)
def load_configurations_to_atoms(configuration_dir, config_set_map_dir):
    """Lazily yield ase.Atoms for every configuration, with its
    configuration_set_id stored in atoms.info.

    Builds a configuration_id -> configuration_set_id lookup from the
    map table, appends it as a column, then streams the configuration
    table one record batch at a time.
    """
    import pyarrow as pa  # local import: only this example builds arrays directly

    co_table = pq.read_table(configuration_dir)
    cs_co_map_table = pq.read_table(config_set_map_dir)
    # Fix: the original referenced cs_co_map_dict without defining it —
    # the map table must first be converted to plain Python columns.
    cs_co_map_dict = cs_co_map_table.to_pydict()
    cs_co_map = dict(zip(cs_co_map_dict['configuration_id'], cs_co_map_dict['configuration_set_id']))
    # Configurations absent from the map get a null configuration_set_id.
    cs_col = pa.array([cs_co_map.get(co_id) for co_id in co_table['configuration_id'].to_pylist()])
    co_table = co_table.append_column('configuration_set_id', cs_col)
    co_table = co_table.select(minimum_structure_cols + property_cols + ['configuration_set_id'])
    for co_batch in co_table.to_batches():
        co_dict = co_batch.to_pydict()
        for i in range(co_batch.num_rows):
            info = {key: co_dict[key][i] for key in property_cols + ['configuration_set_id']}
            atoms = Atoms(
                positions=co_dict['positions'][i],
                numbers=co_dict['atomic_numbers'][i],
                cell=co_dict['cell'][i],
                pbc=co_dict['pbc'][i],
                info=info
            )
            yield atoms

# Example: stream Atoms objects tagged with configuration_set_id (pyarrow).
configurations = load_configurations_to_atoms(configuration_dir, config_set_map_dir)

Parquet file schemas