How to use parquet file downloads

How to use parquet file downloads

The following are examples only and may need to be adapted to your specific use case.
The examples show how to read a dataset's parquet files into ASE Atoms objects or pymatgen Structure or Molecule objects.

Contents


Download and extract

  • Download the desired dataset file using the Download Dataset Parquet Files button on the main search results page or the dataset details page.
  • This will download a tar.gz file in the format <dataset_id>.tar.gz.
  • Using your command line, navigate to the containing directory and execute tar -xf <dataset_id>.tar.gz.
  • A directory called <dataset_id> will be extracted (e.g. DS_a1bc2z345).
  • This directory will contain these non-optional items:
    • <dataset_id>/co/ — A directory of configuration files containing one or more parquet files, formatted as co_0.parquet co_1.parquet ...
    • <dataset_id>/ds.parquet — A file containing dataset details and aggregated information.
  • and may contain these optional items:
    • <dataset_id>/cs/ — A directory containing one or more parquet files of configuration sets, formatted as cs_0.parquet cs_1.parquet ...
    • <dataset_id>/cs_co_map/ — A directory containing one or more parquet files of configuration set to configuration mapping, formatted as cs_co_map_0.parquet cs_co_map_1.parquet ...

See the file schema pages for a full list of column definitions.


Initial Python setup

from pathlib import Path

# Root of the extracted archive. Replace <dataset_id> with the real dataset
# ID (e.g. DS_a1bc2z345). The placeholder is written with literal angle
# brackets rather than HTML entities.
dataset_dir = Path('path/to/dataset/directory/<dataset_id>')

dataset_info_file = dataset_dir / 'ds.parquet'
configuration_dir = dataset_dir / 'co'
configuration_set_dir = dataset_dir / 'cs'  # If present
config_set_map_dir = dataset_dir / 'cs_co_map'  # If present

# Columns the loaders below read to build an Atoms / Structure / Molecule.
minimum_structure_cols = [
    'positions',
    'cell',
    'pbc',
    'atomic_numbers',
]

# Property columns the loaders below attach to each converted object.
property_cols = [
    'adsorption_energy',
    'atomic_forces',
    'atomization_energy',
    'cauchy_stress',
    'electronic_band_gap',
    'energy',
    'energy_above_hull',
    'formation_energy',
]

Load dataset information

# pandas variant: read the dataset-level file (ds.parquet) and print its first rows.
import pandas as pd

dataset_info = pd.read_parquet(dataset_info_file)
print(dataset_info.head())
# NOTE: the lines below repeat the pandas snippet above verbatim.
import pandas as pd

dataset_info = pd.read_parquet(dataset_info_file)
print(dataset_info.head())
# pyarrow variant: read ds.parquet as a table and print it as a plain dict.
import pyarrow.parquet as pq

dataset_info = pq.read_table(dataset_info_file)
print(dataset_info.to_pydict())
# NOTE: the lines below repeat the pyarrow snippet above verbatim.
import pyarrow.parquet as pq

dataset_info = pq.read_table(dataset_info_file)
print(dataset_info.to_pydict())

Load configurations

Configurations are the data points of the dataset. Each configuration row contains a structure and its associated calculated properties.

from ase.atoms import Atoms

def load_configurations_to_atoms(configuration_dir):
    """Yield one ase.Atoms object per configuration row.

    Reads the parquet data at *configuration_dir* with pyarrow, keeps only
    the structure and property columns, and walks the table batch by batch.
    The property values of each row are stored in ``atoms.info``.
    """
    wanted_cols = minimum_structure_cols + property_cols
    table = pq.read_table(configuration_dir).select(wanted_cols)
    for batch in table.to_batches():
        columns = batch.to_pydict()
        for row in range(batch.num_rows):
            yield Atoms(
                positions=columns['positions'][row],
                numbers=columns['atomic_numbers'][row],
                cell=columns['cell'][row],
                pbc=columns['pbc'][row],
                info={name: columns[name][row] for name in property_cols},
            )

configurations = load_configurations_to_atoms(configuration_dir)

# Iterate from generator
for config in configurations:
    pass

# Iterate in batches
from itertools import batched  # available from Python 3.12

# The loop above consumed the generator, so create a fresh one; otherwise
# batched() would receive an exhausted iterator and produce no batches.
configurations = load_configurations_to_atoms(configuration_dir)

batch_size = 10_000
co_batches = batched(configurations, batch_size)
for co_batch in co_batches:
    pass
from pymatgen.core import Structure, Molecule, Lattice

def pd_row_to_pymatgen(row):
    """Convert a configuration DataFrame row into a pymatgen object.

    A row whose cell entries are all zero becomes a Molecule; any other row
    becomes a Structure with Cartesian coordinates. The property columns
    are passed through as ``properties`` in both cases.
    """
    has_no_cell = all(x == 0 for y in row['cell'] for x in y)
    species = row['atomic_numbers']
    coords = row['positions'].tolist()
    properties = {key: row[key] for key in property_cols}
    if has_no_cell:
        return Molecule(coords=coords, species=species, properties=properties)
    lattice = Lattice(row['cell'].tolist(), row['pbc'])
    return Structure(
        coords=coords,
        species=species,
        lattice=lattice,
        coords_are_cartesian=True,
        properties=properties,
    )
    

def load_configurations_to_structures(configuration_dir):
    """Read the configuration parquet files with pandas and convert each row.

    Only the structure and property columns are kept; every row is passed to
    ``pd_row_to_pymatgen`` and the result of ``DataFrame.apply`` is returned.
    """
    wanted_cols = minimum_structure_cols + property_cols
    frame = pd.read_parquet(configuration_dir)
    return frame[wanted_cols].apply(pd_row_to_pymatgen, axis=1)

# Convert every configuration row under configuration_dir with the pandas-based loader.
configurations = load_configurations_to_structures(configuration_dir)
from ase.atoms import Atoms

def load_configurations_to_atoms(configuration_dir):
    """Yield one ase.Atoms object per configuration row.

    Reads the parquet data at *configuration_dir* with pyarrow, keeps only
    the structure and property columns, and walks the table batch by batch.
    The property values of each row are stored in ``atoms.info``.
    """
    wanted_cols = minimum_structure_cols + property_cols
    table = pq.read_table(configuration_dir).select(wanted_cols)
    for batch in table.to_batches():
        columns = batch.to_pydict()
        for row in range(batch.num_rows):
            yield Atoms(
                positions=columns['positions'][row],
                numbers=columns['atomic_numbers'][row],
                cell=columns['cell'][row],
                pbc=columns['pbc'][row],
                info={name: columns[name][row] for name in property_cols},
            )

configurations = load_configurations_to_atoms(configuration_dir)

# Iterate from generator
for config in configurations:
    pass

# Iterate in batches
from itertools import batched  # available from Python 3.12

# The loop above consumed the generator, so create a fresh one; otherwise
# batched() would receive an exhausted iterator and produce no batches.
configurations = load_configurations_to_atoms(configuration_dir)

batch_size = 10_000
co_batches = batched(configurations, batch_size)
for co_batch in co_batches:
    pass
from pymatgen.core import Structure, Molecule, Lattice

def load_configurations_to_pymatgen(configuration_dir):
    """Yield a pymatgen Molecule or Structure for each configuration row.

    Reads the parquet data at *configuration_dir* with pyarrow, keeps only
    the structure and property columns, and walks the table batch by batch.
    A row whose cell entries are all zero is yielded as a Molecule; any
    other row is yielded as a Structure with Cartesian coordinates.
    """
    co_table = pq.read_table(configuration_dir)
    co_table = co_table.select(minimum_structure_cols + property_cols)
    for co_batch in co_table.to_batches():
        co_dict = co_batch.to_pydict()
        for i in range(co_batch.num_rows):
            cell = co_dict['cell'][i]
            # Index with [i] so each object carries only its own row's
            # property values, not the whole batch column.
            properties = {key: co_dict[key][i] for key in property_cols}
            if all(x == 0 for y in cell for x in y):
                yield Molecule(
                    coords=co_dict['positions'][i],
                    species=co_dict['atomic_numbers'][i],
                    properties=properties,
                )
            else:
                yield Structure(
                    coords=co_dict['positions'][i],
                    species=co_dict['atomic_numbers'][i],
                    lattice=Lattice(cell, co_dict['pbc'][i]),
                    coords_are_cartesian=True,
                    properties=properties,
                )


configurations = load_configurations_to_pymatgen(configuration_dir)

# Iterate from generator
for config in configurations:
    pass

# Iterate in batches
from itertools import batched  # available from Python 3.12

# The loop above consumed the generator, so create a fresh one; otherwise
# batched() would receive an exhausted iterator and produce no batches.
configurations = load_configurations_to_pymatgen(configuration_dir)

batch_size = 10_000
co_batches = batched(configurations, batch_size)
for co_batch in co_batches:
    pass

Load configurations with configuration sets

def pd_row_to_atoms_with_cs_id(row):
    """Build an ase.Atoms object from a DataFrame row.

    The property columns and the row's ``configuration_set_id`` are stored
    in the ``atoms.info`` dict.
    """
    info_keys = property_cols + ['configuration_set_id']
    return Atoms(
        positions=row['positions'].tolist(),
        numbers=row['atomic_numbers'],
        cell=row['cell'].tolist(),
        pbc=row['pbc'],
        info={name: row[name] for name in info_keys},
    )


def select_one_configuration_set(configuration_dir, config_set_map_dir, configuration_set_id):
    """Return ase.Atoms objects for the configurations of a single configuration set.

    The mapping table is filtered down to *configuration_set_id* and
    inner-joined to the configurations on ``configuration_id``; each joined
    row is then converted with ``pd_row_to_atoms_with_cs_id``.
    """
    configs = pd.read_parquet(configuration_dir)
    cs_map = pd.read_parquet(config_set_map_dir)
    wanted_set = cs_map[cs_map['configuration_set_id'] == configuration_set_id]
    joined = configs.merge(wanted_set, on='configuration_id', how='inner')
    return joined.apply(pd_row_to_atoms_with_cs_id, axis=1)

# Example call; 'CS_0x40mfbhvhiu_0' is presumably a sample configuration set ID — substitute one from your dataset.
configurations = select_one_configuration_set(configuration_dir, config_set_map_dir, 'CS_0x40mfbhvhiu_0')


def load_configurations_to_atoms(configuration_dir, config_set_map_dir):
    """Return ase.Atoms objects for all configurations that appear in the configuration set mapping.

    Configurations are inner-joined to the mapping on ``configuration_id``,
    and each joined row is converted with ``pd_row_to_atoms_with_cs_id``, so
    ``configuration_set_id`` ends up in ``atoms.info``.
    """
    configs = pd.read_parquet(configuration_dir)
    cs_map = pd.read_parquet(config_set_map_dir)
    joined = configs.merge(cs_map, on='configuration_id', how='inner')
    return joined.apply(pd_row_to_atoms_with_cs_id, axis=1)


# Load the configurations of every configuration set via the pandas merge defined above.
configurations = load_configurations_to_atoms(configuration_dir, config_set_map_dir)
def pd_row_to_pymatgen_with_cs_id(row):
    """Convert a DataFrame row into a pymatgen Molecule or Structure.

    A row whose cell entries are all zero becomes a Molecule; any other row
    becomes a Structure with Cartesian coordinates. The property columns and
    ``configuration_set_id`` are passed through as ``properties``.
    """
    has_no_cell = all(x == 0 for y in row['cell'] for x in y)
    prop_keys = property_cols + ['configuration_set_id']
    coords = row['positions'].tolist()
    species = row['atomic_numbers']
    properties = {name: row[name] for name in prop_keys}
    if has_no_cell:
        return Molecule(coords=coords, species=species, properties=properties)
    lattice = Lattice(row['cell'].tolist(), row['pbc'])
    return Structure(
        coords=coords,
        species=species,
        lattice=lattice,
        coords_are_cartesian=True,
        properties=properties,
    )

def select_one_configuration_set(configuration_dir, config_set_map_dir, configuration_set_id):
    """Return pymatgen Structure or Molecule objects for the configurations of a single configuration set.

    The mapping table is filtered down to *configuration_set_id* and
    inner-joined to the configurations on ``configuration_id``; each joined
    row is then converted with ``pd_row_to_pymatgen_with_cs_id``.
    """
    configs = pd.read_parquet(configuration_dir)
    cs_map = pd.read_parquet(config_set_map_dir)
    wanted_set = cs_map[cs_map['configuration_set_id'] == configuration_set_id]
    joined = configs.merge(wanted_set, on='configuration_id', how='inner')
    return joined.apply(pd_row_to_pymatgen_with_cs_id, axis=1)

# Example call; 'CS_0x40mfbhvhiu_0' is presumably a sample configuration set ID — substitute one from your dataset.
configurations = select_one_configuration_set(configuration_dir, config_set_map_dir, 'CS_0x40mfbhvhiu_0')


def load_configurations_to_atoms(configuration_dir, config_set_map_dir):
    """Return pymatgen objects for all configurations that appear in the configuration set mapping.

    Despite the function name, rows are converted with
    ``pd_row_to_pymatgen_with_cs_id``, so the result holds pymatgen
    Structure/Molecule objects with ``configuration_set_id`` in their
    properties, not ase.Atoms.
    """
    configs = pd.read_parquet(configuration_dir)
    cs_map = pd.read_parquet(config_set_map_dir)
    joined = configs.merge(cs_map, on='configuration_id', how='inner')
    return joined.apply(pd_row_to_pymatgen_with_cs_id, axis=1)


# Load the configurations of every configuration set; the function called here converts rows to pymatgen objects.
configurations = load_configurations_to_atoms(configuration_dir, config_set_map_dir)
import pyarrow as pa

def load_configurations_to_atoms(configuration_dir, config_set_map_dir):
    """Yield ase.Atoms for every configuration, tagged with its configuration_set_id.

    Reads the configuration and mapping parquet data with pyarrow, looks up
    each configuration's set ID, and stores it with the property columns in
    ``atoms.info``.

    A configuration with no entry in the mapping gets ``None`` as its
    ``configuration_set_id``. If a configuration is mapped to several sets,
    only the last mapping read is kept.
    """
    co_table = pq.read_table(configuration_dir)
    cs_co_map_table = pq.read_table(config_set_map_dir)
    # Convert both columns to plain Python values first. Iterating a pyarrow
    # column yields pyarrow scalars, which do not match the Python values
    # used for the lookups below.
    cs_co_map = dict(zip(
        cs_co_map_table['configuration_id'].to_pylist(),
        cs_co_map_table['configuration_set_id'].to_pylist(),
    ))
    cs_col = pa.array([cs_co_map.get(co_id) for co_id in co_table['configuration_id'].to_pylist()])
    co_table = co_table.append_column('configuration_set_id', cs_col)
    info_cols = property_cols + ['configuration_set_id']
    co_table = co_table.select(minimum_structure_cols + info_cols)
    for co_batch in co_table.to_batches():
        co_dict = co_batch.to_pydict()
        for i in range(co_batch.num_rows):
            info = {key: co_dict[key][i] for key in info_cols}
            atoms = Atoms(
                positions=co_dict['positions'][i],
                numbers=co_dict['atomic_numbers'][i],
                cell=co_dict['cell'][i],
                pbc=co_dict['pbc'][i],
                info=info
            )
            yield atoms

# Create the generator of ase.Atoms objects (pyarrow variant with configuration_set_id).
configurations = load_configurations_to_atoms(configuration_dir, config_set_map_dir)
import pyarrow as pa

def load_configurations_to_pymatgen(configuration_dir, config_set_map_dir):
    """Yield a pymatgen Structure or Molecule for every configuration, tagged with its configuration_set_id.

    Reads the configuration and mapping parquet data with pyarrow and looks
    up each configuration's set ID. A row whose cell entries are all zero is
    yielded as a Molecule; any other row is yielded as a Structure with
    Cartesian coordinates.

    A configuration with no entry in the mapping gets ``None`` as its
    ``configuration_set_id``. If a configuration is mapped to several sets,
    only the last mapping read is kept.
    """
    co_table = pq.read_table(configuration_dir)
    cs_co_map_table = pq.read_table(config_set_map_dir)
    # Convert both columns to plain Python values first. Iterating a pyarrow
    # column yields pyarrow scalars, which do not match the Python values
    # used for the lookups below.
    cs_co_map = dict(zip(
        cs_co_map_table['configuration_id'].to_pylist(),
        cs_co_map_table['configuration_set_id'].to_pylist(),
    ))
    cs_col = pa.array([cs_co_map.get(co_id) for co_id in co_table['configuration_id'].to_pylist()])
    co_table = co_table.append_column('configuration_set_id', cs_col)
    prop_cols = property_cols + ['configuration_set_id']
    co_table = co_table.select(minimum_structure_cols + prop_cols)
    for co_batch in co_table.to_batches():
        co_dict = co_batch.to_pydict()
        for i in range(co_batch.num_rows):
            cell = co_dict['cell'][i]
            # Index with [i] so each object carries only its own row's
            # property values, not the whole batch column.
            properties = {key: co_dict[key][i] for key in prop_cols}
            if all(x == 0 for y in cell for x in y):
                yield Molecule(
                    coords=co_dict['positions'][i],
                    species=co_dict['atomic_numbers'][i],
                    properties=properties,
                )
            else:
                yield Structure(
                    coords=co_dict['positions'][i],
                    species=co_dict['atomic_numbers'][i],
                    lattice=Lattice(cell, co_dict['pbc'][i]),
                    coords_are_cartesian=True,
                    properties=properties,
                )

# Create the generator of pymatgen objects (pyarrow variant with configuration_set_id).
configurations = load_configurations_to_pymatgen(configuration_dir, config_set_map_dir)

Parquet file schemas