Source code for datarade.services

"""
These services allow a user to interact with datasets stored in a git-compliant source control repository. This layer
should be treated as the interface to this library. In other words, breaking changes may be introduced at lower levels,
but this layer should remain relatively stable as the library matures.
"""
from typing import Optional

from bcp import DataFile
import yaml

from datarade import models, schemas


def get_dataset_catalog(repository: str, organization: str, platform: str, project: str = None,
                        branch: 'Optional[str]' = 'master', username: str = None,
                        password: str = None) -> 'models.DatasetCatalog':
    """
    A factory function that provides a DatasetCatalog instance

    The structure of the files in the dataset catalog should look like this:

    .. code-block:: none

        repository
        |
        |--- catalog
            |
            |--- my_dataset
            |    |
            |    |--- config.yaml
            |    |--- definition.sql
            |
            |--- my_other_dataset
                 |
                 |--- config.yaml
                 |--- definition.sql

    The repository can be hosted on GitHub or on Azure Repos. Multiple branches can be used for managing related
    dataset catalogs. For instance, you may want to maintain a uat branch and a production branch for managing
    environments. Or you may want one repo for all of your catalogs, but you want to provide some organization to
    your datasets.

    Args:
        repository: the name of the repository
        organization: the name of the organization (or user for GitHub) that owns the repository
        platform: the platform that hosts the repository ['github', 'azure-devops']
        project: the name of the project that contains the repository, only used for Azure Repos
        branch: the branch to use in the repository, defaults to 'master'
        username: the username with read access to the repository, only used for Azure Repos
        password: the password for that user, only used for Azure Repos; this can also be the one-time git
            credentials password that bypasses MFA

    Returns: a DatasetCatalog instance
    """
    return models.DatasetCatalog(repository=repository, organization=organization, platform=platform,
                                 project=project, branch=branch, username=username, password=password)
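

# Example usage (an illustrative sketch, not part of the module; the repository
# and organization names below are hypothetical):
#
#     catalog = get_dataset_catalog(repository='my-datasets',
#                                   organization='my-org',
#                                   platform='github',
#                                   branch='master')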


def get_dataset_container(driver: str, database_name: str, host: str, port: int = None, schema_name: str = None,
                          username: str = None, password: str = None) -> 'models.DatasetContainer':
    """
    A factory function that provides a DatasetContainer instance

    Args:
        driver: the type of database, currently only 'mssql' is supported
        database_name: the name of the database
        host: the name of the server, including the instance
        port: the port that the database is listening to on the server
        schema_name: the name of the schema
        username: a user with create table and insert permissions on the schema
        password: the password for the user

    Returns: a DatasetContainer instance
    """
    database = models.Database(driver=driver, database_name=database_name, host=host, port=port,
                               schema_name=schema_name)
    return models.DatasetContainer(database=database, username=username, password=password)
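

# Example usage (a sketch; the server, database, schema, and credentials below
# are hypothetical placeholders for your own SQL Server environment):
#
#     container = get_dataset_container(driver='mssql',
#                                       database_name='staging',
#                                       host=r'my-server\my-instance',
#                                       schema_name='dbo',
#                                       username='etl_user',
#                                       password='...')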


def get_dataset(dataset_catalog: 'models.DatasetCatalog', dataset_name: str) -> 'models.Dataset':
    """
    Returns a datarade Dataset object using the identified configuration in the dataset catalog

    It collects all of the required files from the dataset catalog repository, puts their contents in a
    configuration dictionary, passes that dictionary through the dataset schema for validation, and returns the
    resulting Dataset instance.

    Args:
        dataset_catalog: the dataset catalog that contains the dataset
        dataset_name: the name of the dataset, which is also the name of the directory containing the files in
            the repository

    Returns: a Dataset object
    """
    config_yaml = dataset_catalog.git.get_file_contents(f'catalog/{dataset_name}/config.yaml')
    definition = dataset_catalog.git.get_file_contents(f'catalog/{dataset_name}/definition.sql')
    dataset_dict = yaml.safe_load(config_yaml)
    dataset_dict['definition'] = definition
    dataset_schema = schemas.DatasetSchema()
    return dataset_schema.load(dataset_dict)
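

# Example usage (a sketch building on the catalog example above; 'my_dataset'
# must match a directory name under catalog/ in the repository):
#
#     dataset = get_dataset(dataset_catalog=catalog, dataset_name='my_dataset')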


def write_dataset(dataset: 'models.Dataset', dataset_container: 'models.DatasetContainer', username: str = None,
                  password: str = None):
    """
    Writes the supplied dataset to the dataset container

    The supplied dataset is exported using the provided credentials. If no credentials are supplied, Windows AD is
    used for the account running this script. Data is written out to ~/bcp/data and logs are written out to
    ~/bcp/logs. Data is then imported into the supplied dataset container using the credentials in that dataset
    container. Again, if no credentials were supplied, Windows AD is used. Error records are written out to
    ~/bcp/data and logs are written out to ~/bcp/logs. On a successful write, the data file is deleted to avoid
    leaving copies of data behind on the application machine.

    Args:
        dataset: the dataset to be written
        dataset_container: the database to store the dataset in
        username: a user with select/execute permissions on the source database objects
        password: the password for the user
    """
    dataset_container.create_table(dataset=dataset)
    if username is None and dataset.user is not None:
        username = dataset.user.username
    data_file = DataFile(delimiter='|~|')
    source_bcp = dataset.database.bcp(username=username, password=password)
    source_bcp.dump(query=dataset.definition, output_file=data_file)
    dataset_container.bcp.load(input_file=data_file, table=dataset_container.database.full_table_name(dataset.name))
    data_file.file.unlink()
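

# End-to-end example (a sketch tying the functions above together; all names are
# hypothetical, and credentials fall back to Windows AD when omitted):
#
#     catalog = get_dataset_catalog(repository='my-datasets', organization='my-org', platform='github')
#     dataset = get_dataset(dataset_catalog=catalog, dataset_name='my_dataset')
#     container = get_dataset_container(driver='mssql', database_name='staging', host='my-server')
#     write_dataset(dataset=dataset, dataset_container=container)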