Source code for pysmFISH.hdf5_utils

"""
Functions that handle hdf5 files
"""

import h5py
import numpy as np
import glob
from collections import OrderedDict
import pickle
import loompy


[docs]def hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb,hybridization,
                                   analysis_name, hyb_dir,converted_positions,
                                   image_properties):
    
    """ 

    Create the hdf5 file that will contain the preprocessed images
    for one hybridization.
    The preprocessing .hdf5 file must have a .ppf.hdf5 extension
    The file will be created for serial writing of the hdf5 file.
    After creation the file is closed.
    Remember that the 'lzf' compression do not accept extra items
    
    Parameters:
    -----------

    hybridizations_infos: dict 
        Parsed Hybridizations metadata
    processing_hyb: str
        Hybridization to process. The function will create an hdf5 file for the 
        hybridization only
    hybridization: str 
        Hybridization name
    analysis_name: str 
        Analysis name
    hyb_dir: str 
        Path of the dir where to save the file
    converted_positions: dict 
        Parsed positions
    """
    
    compression = 'lzf'
    preprocessing_file_name= analysis_name+'_'+processing_hyb+'.ppf.hdf5'
    preprocessing_file = h5py.File(hyb_dir+preprocessing_file_name, 'w',libver='latest')
    
    # ImagesInfo
    Hyb_cols_num_px=image_properties['HybImageSize']['columns']
    Hyb_rows_num_px=image_properties['HybImageSize']['rows']

    Strip_cols_num_px=image_properties['StripImageSize']['columns']
    Strip_rows_num_px=image_properties['StripImageSize']['rows']

    # Create the groups and the subgroups

    # List of the XY positions
    Positions_list=list(converted_positions[hybridization].keys())

    for gene in hybridizations_infos[hybridization].keys():
            gen_grp=preprocessing_file.create_group(gene)
            Fd_group=gen_grp.create_group('FilteredData')
            preprocessing_file.flush()
            if 'Hyb' in hybridization:

                for pos in Positions_list:
                    Fd_group.create_dataset(str(pos),dtype = np.uint16, \
                        shape=(Hyb_rows_num_px,Hyb_cols_num_px),\
                        compression=compression,\
                        shuffle=True,chunks=(Hyb_rows_num_px,Hyb_cols_num_px))
            else:
                for pos in Positions_list:
                    Fd_group.create_dataset(str(pos),dtype = np.uint16,\
                        shape=(Strip_rows_num_px,Strip_cols_num_px),\
                        compression=compression,\
                        shuffle=True,chunks=(Hyb_rows_num_px,Hyb_cols_num_px))

    preprocessing_file.close()
    preprocessing_file_path=hyb_dir + preprocessing_file_name
    
    return preprocessing_file_path




[docs]def create_loompy_raw_counting(hybridizations_infos,converted_positions,
                        hybridization,flt_rawcnt_config,hyb_dir,
                        processing_hyb,counting_gene_dirs):
    """
    Function used to write the counting results in a loom file

    Parameters:
    -----------
    
    hybridizations_infos: dict 
        Parsed information on the experiment.
    converted_positions: dict 
        coords of the images for all hybridization.
    hybridization: str 
        hybridization processed (ex. Hybridization2) 
    flt_rawcnt_config: dict
        Parsed filtering a raw counting configuration file 
        (Filtering_raw_counting.config.yaml)
    counting_gene_dirs: list 
        List of the directories containing the counts


    """

    # Loompy matrix column attributes
    pos_att = np.arange(0,len(converted_positions[hybridization].keys()))

    # Create dictionaries to store the data that have one array for each image
    # position

    counting_dict = OrderedDict()
    selected_peaks_dict = OrderedDict()
    thr_array_dict = OrderedDict()
    peaks_coords_dict = OrderedDict()
    total_peaks_dict = OrderedDict()
    selected_peaks_int_dict = OrderedDict()


    gene_idx_pos = 0
    gene_idx_list = []
    gene_list = list(hybridizations_infos[hybridization].keys())
    gene_list = [gene for gene in gene_list if gene not in flt_rawcnt_config['skip_genes_counting'] ]
    gene_list = [gene for tag in flt_rawcnt_config['skip_tags_counting'] for gene in gene_list if tag not in gene]

    # Create matrices for loompy layers
    total_counts_mat = np.zeros([len(gene_list),len(converted_positions[hybridization].keys())])
    selected_thr_mat = np.zeros([len(gene_list),len(converted_positions[hybridization].keys())])
    calculated_thr_mat = np.zeros([len(gene_list),len(converted_positions[hybridization].keys())])
    thr_idx_mat = np.zeros([len(gene_list),len(converted_positions[hybridization].keys())])
    stringency_mat = np.zeros([len(gene_list),len(converted_positions[hybridization].keys())])

    array_positions = np.sort(list(converted_positions[hybridization].keys()))

    if gene_list:
        for gene in gene_list:
            gene_idx_list.append(gene+'_'+hybridization)
            counting_gene_dir = [fdir for fdir in counting_gene_dirs if gene in fdir][0]
            counting_files_list = glob.glob(counting_gene_dir+'*.pkl')

            counting_dict[gene] =OrderedDict()

            counting_dict[gene]['selected_peaks'] =OrderedDict()
            counting_dict[gene]['thr_array'] =OrderedDict()
            counting_dict[gene]['peaks_coords'] =OrderedDict()
            counting_dict[gene]['total_peaks'] =OrderedDict()
            counting_dict[gene]['selected_peaks_int'] =OrderedDict()



            # Process the files according to sorted position
            for pos in array_positions:
                counting_file = [cf for cf in counting_files_list if 'pos_'+str(pos) in cf][0]
                countings = pickle.load(open(counting_file,'rb'))
                # pos = np.int(counting_file.split('/')[-1].split('_')[-1].split('.')[0])

                # loompy layers construction
                selected_thr_mat[gene_idx_pos,pos] = countings['selected_thr']
                calculated_thr_mat[gene_idx_pos,pos] = countings['calculated_thr']
                thr_idx_mat[gene_idx_pos,pos] = countings['thr_idx']
                stringency_mat[gene_idx_pos,pos] = countings['stringency']
                if isinstance(countings['selected_peaks'], list):
                    total_counts_mat[gene_idx_pos,pos] = len(countings['selected_peaks'])
                else:
                    total_counts_mat[gene_idx_pos,pos] = 0


                # Dictionaries construction

                counting_dict[gene]['selected_peaks'][pos] = countings['selected_peaks']
                counting_dict[gene]['thr_array'][pos] = countings['thr_array']
                counting_dict[gene]['peaks_coords'][pos] = countings['peaks_coords']
                counting_dict[gene]['total_peaks'][pos] = countings['total_peaks']
                counting_dict[gene]['selected_peaks_int'][pos] = countings['selected_peaks_int']



            gene_idx_pos +=1

    # Create loompy file
    loom_fname = hyb_dir+processing_hyb+'_raw_counting.loom'
    loom_hdl = loompy.create(filename=loom_fname,matrix=total_counts_mat,row_attrs={'genes':np.array(gene_idx_list)},col_attrs={'image_pos':np.array(pos_att)})
    loom_hdl.set_layer(name='selected_thr',matrix=selected_thr_mat)
    loom_hdl.set_layer(name='calculated_thr',matrix=calculated_thr_mat)
    loom_hdl.set_layer(name='thr_idx',matrix=thr_idx_mat)
    loom_hdl.set_layer(name='stringency',matrix=stringency_mat)
    loom_hdl.close()    

    # Add extra data to the loom file
    with h5py.File(loom_fname,'r+') as loom_hdl:
        counting_arrays_grp = loom_hdl.create_group('counting_arrays')
        for gene in counting_dict.keys():
            gene_grp = counting_arrays_grp.create_group(gene+'_'+hybridization)
            selected_peaks_grp = gene_grp.create_group('selected_peaks')
            thr_array_grp = gene_grp.create_group('thr_array')
            peaks_coords_grp = gene_grp.create_group('peaks_coords')
            total_peaks_grp = gene_grp.create_group('total_peaks')
            selected_peaks_int_grp = gene_grp.create_group('selected_peaks_int')
            for pos in array_positions:
                selected_peaks_grp.create_dataset(str(pos), data=counting_dict[gene]['selected_peaks'][pos])
                thr_array_grp.create_dataset(str(pos),data=counting_dict[gene]['thr_array'][pos])
                total_peaks_grp.create_dataset(str(pos),data=counting_dict[gene]['total_peaks'][pos])
                selected_peaks_int_grp.create_dataset(str(pos),data=counting_dict[gene]['selected_peaks_int'][pos])
                pos_sbgrp = peaks_coords_grp.create_group(str(pos))
                if isinstance(counting_dict[gene]['peaks_coords'][pos], list):
                    for idx,peaks in enumerate(counting_dict[gene]['peaks_coords'][pos]):
                        pos_sbgrp.create_dataset(str(idx),data=peaks)
                else:
                    pos_sbgrp.create_dataset(str(idx),data=peaks)