Source code for python_pachyderm.mixin.pps

import os
import json
import base64
import warnings
from pathlib import Path

from python_pachyderm.proto.pps import pps_pb2 as pps_proto
from python_pachyderm.service import Service
from .util import commit_from


class PPSMixin:
    def inspect_job(self, job_id, block_state=None, output_commit=None, full=None):
        """Inspects a job with a given ID. Returns a ``JobInfo``.

        Parameters
        ----------
        job_id : str
            The ID of the job to inspect.
        block_state : bool, optional
            If true, block until the job completes.
        output_commit : Union[tuple, str, Commit protobuf], optional
            Represents an output commit to filter on.
        full : bool, optional
            If true, include worker status.
        """
        return self._req(
            Service.PPS,
            "InspectJob",
            job=pps_proto.Job(id=job_id),
            block_state=block_state,
            output_commit=commit_from(output_commit)
            if output_commit is not None
            else None,
            full=full,
        )
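    # Illustrative usage sketch (not part of the library): assuming a connected
    # ``python_pachyderm.Client`` named ``client`` and a placeholder job ID,
    # this blocks until the job finishes and then reports its state:
    #
    #   job_info = client.inspect_job("0123456789abcdef", block_state=True)
    #   print(job_info.state)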
    def list_job(
        self,
        pipeline_name=None,
        input_commit=None,
        output_commit=None,
        history=None,
        full=None,
        jqFilter=None,
    ):
        """.. # noqa: W505

        Lists jobs. Yields ``JobInfo`` objects.

        Parameters
        ----------
        pipeline_name : str, optional
            A pipeline name to filter on.
        input_commit : List[Union[tuple, str, Commit protobuf]], optional
            An optional list representing input commits to filter on.
        output_commit : Union[tuple, str, Commit protobuf], optional
            Represents an output commit to filter on.
        history : int, optional
            Indicates to return jobs from historical versions of pipelines.
            Semantics are:

            - 0: Return jobs from the current version of the pipeline or
              pipelines.
            - 1: Return the above and jobs from the next most recent version
            - 2: etc.
            - -1: Return jobs from all historical versions.
        full : bool, optional
            Whether the result should include all pipeline details in each
            ``JobInfo``, or limited information including name and status, but
            excluding information in the pipeline spec. Leaving this ``None``
            (or ``False``) can make the call significantly faster in clusters
            with a large number of pipelines and jobs. Note that if
            `input_commit` is set, this field is coerced to ``True``.
        jqFilter : str, optional
            A ``jq`` filter that can restrict the list of jobs returned.
        """
        if isinstance(input_commit, list):
            input_commit = [commit_from(ic) for ic in input_commit]
        elif input_commit is not None:
            input_commit = [commit_from(input_commit)]

        return self._req(
            Service.PPS,
            "ListJobStream",
            pipeline=pps_proto.Pipeline(name=pipeline_name)
            if pipeline_name is not None
            else None,
            input_commit=input_commit,
            output_commit=commit_from(output_commit)
            if output_commit is not None
            else None,
            history=history,
            full=full,
            jqFilter=jqFilter,
        )
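    # Illustrative usage sketch (assumptions: a connected ``python_pachyderm.Client``
    # named ``client`` and a pipeline called "edges" are placeholders): the result is
    # a stream, so iterate over the yielded ``JobInfo`` objects lazily:
    #
    #   for job_info in client.list_job(pipeline_name="edges", history=-1):
    #       print(job_info.job.id, job_info.state)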
    def flush_job(self, commits, pipeline_names=None):
        """Blocks until all of the jobs which have a set of commits as
        provenance have finished. Yields ``JobInfo`` objects.

        Parameters
        ----------
        commits : List[Union[tuple, str, Commit protobuf]]
            A list representing the commits to flush.
        pipeline_names : List[str], optional
            A list of strings specifying pipeline names. If specified, only
            jobs within these pipelines will be flushed.
        """
        if pipeline_names is not None:
            to_pipelines = [pps_proto.Pipeline(name=name) for name in pipeline_names]
        else:
            to_pipelines = None

        return self._req(
            Service.PPS,
            "FlushJob",
            commits=[commit_from(c) for c in commits],
            to_pipelines=to_pipelines,
        )
    def delete_job(self, job_id):
        """Deletes a job by its ID.

        Parameters
        ----------
        job_id : str
            The ID of the job to delete.
        """
        return self._req(Service.PPS, "DeleteJob", job=pps_proto.Job(id=job_id))
    def stop_job(self, job_id):
        """Stops a job by its ID.

        Parameters
        ----------
        job_id : str
            The ID of the job to stop.
        """
        return self._req(Service.PPS, "StopJob", job=pps_proto.Job(id=job_id))
    def inspect_datum(self, job_id, datum_id):
        """Inspects a datum. Returns a ``DatumInfo`` object.

        Parameters
        ----------
        job_id : str
            The ID of the job.
        datum_id : str
            The ID of the datum.
        """
        return self._req(
            Service.PPS,
            "InspectDatum",
            datum=pps_proto.Datum(id=datum_id, job=pps_proto.Job(id=job_id)),
        )
    def list_datum(
        self, job_id=None, page_size=None, page=None, input=None, status_only=None
    ):
        """Lists datums. Yields ``ListDatumStreamResponse`` objects.

        Parameters
        ----------
        job_id : str, optional
            The ID of a job. Exactly one of `job_id` (real) or `input`
            (hypothetical) must be set.
        page_size : int, optional
            The size of the page.
        page : int, optional
            The page number.
        input : Input protobuf, optional
            If set in lieu of `job_id`, ``list_datum()`` returns the datums
            that would be given to a hypothetical job that used `input` as
            its input spec. Exactly one of `job_id` (real) or `input`
            (hypothetical) must be set.
        status_only : bool, optional
            An optional bool.
        """
        return self._req(
            Service.PPS,
            "ListDatumStream",
            job=pps_proto.Job(id=job_id),
            page_size=page_size,
            page=page,
            input=input,
            status_only=status_only,
        )
    def restart_datum(self, job_id, data_filters=None):
        """Restarts a datum.

        Parameters
        ----------
        job_id : str
            The ID of the job.
        data_filters : List[str], optional
            An optional iterable of strings.
        """
        return self._req(
            Service.PPS,
            "RestartDatum",
            job=pps_proto.Job(id=job_id),
            data_filters=data_filters,
        )
    def create_pipeline(
        self,
        pipeline_name,
        transform,
        parallelism_spec=None,
        hashtree_spec=None,
        egress=None,
        update=None,
        output_branch=None,
        resource_requests=None,
        resource_limits=None,
        input=None,
        description=None,
        cache_size=None,
        enable_stats=None,
        reprocess=None,
        max_queue_size=None,
        service=None,
        chunk_spec=None,
        datum_timeout=None,
        job_timeout=None,
        salt=None,
        standby=None,
        datum_tries=None,
        scheduling_spec=None,
        pod_patch=None,
        spout=None,
        spec_commit=None,
        metadata=None,
        s3_out=None,
        sidecar_resource_limits=None,
        reprocess_spec=None,
        autoscaling=None,
    ):
        """Creates a pipeline. For more info, please refer to the pipeline
        spec document:
        http://docs.pachyderm.io/en/latest/reference/pipeline_spec.html

        Parameters
        ----------
        pipeline_name : str
            The pipeline name.
        transform : Transform protobuf
            A ``Transform`` object.
        parallelism_spec : ParallelismSpec protobuf, optional
            An optional ``ParallelismSpec`` object.
        hashtree_spec : HashtreeSpec protobuf, optional
            An optional ``HashtreeSpec`` object.
        egress : Egress protobuf, optional
            An optional ``Egress`` object.
        update : bool, optional
            Whether this should behave as an upsert.
        output_branch : str, optional
            The branch to output results on.
        resource_requests : ResourceSpec protobuf, optional
            An optional ``ResourceSpec`` object.
        resource_limits : ResourceSpec protobuf, optional
            An optional ``ResourceSpec`` object.
        input : Input protobuf, optional
            An optional ``Input`` object.
        description : str, optional
            Description of the pipeline.
        cache_size : str, optional
            An optional string.
        enable_stats : bool, optional
            An optional bool.
        reprocess : bool, optional
            If true, Pachyderm forces the pipeline to reprocess all datums.
            It only has meaning if `update` is ``True``.
        max_queue_size : int, optional
            An optional int.
        service : Service protobuf, optional
            An optional ``Service`` object.
        chunk_spec : ChunkSpec protobuf, optional
            An optional ``ChunkSpec`` object.
        datum_timeout : Duration protobuf, optional
            An optional ``Duration`` object.
        job_timeout : Duration protobuf, optional
            An optional ``Duration`` object.
        salt : str, optional
            An optional string.
        standby : bool, optional
            An optional bool.
        datum_tries : int, optional
            An optional int.
        scheduling_spec : SchedulingSpec protobuf, optional
            An optional ``SchedulingSpec`` object.
        pod_patch : str, optional
            An optional string.
        spout : Spout protobuf, optional
            An optional ``Spout`` object.
        spec_commit : Commit protobuf, optional
            An optional ``Commit`` object.
        metadata : Metadata protobuf, optional
            An optional ``Metadata`` object.
        s3_out : bool, optional
            Unused.
        sidecar_resource_limits : ResourceSpec protobuf, optional
            An optional ``ResourceSpec`` setting resource limits for the
            pipeline sidecar.
        reprocess_spec : str, optional
            An optional string.
        autoscaling : bool, optional
            An optional bool.
        """
        # Support for build step-enabled pipelines. This is a python port of
        # the equivalent functionality in pachyderm core's
        # 'src/server/pps/cmds/cmds.go', and any changes made here likely have
        # to be reflected there as well.
        if transform.build.image or transform.build.language or transform.build.path:
            if spout:
                raise Exception("build step-enabled pipelines do not work with spouts")
            if not input:
                raise Exception("no `input` specified")
            if (not transform.build.language) and (not transform.build.image):
                raise Exception("must specify either a build `language` or `image`")
            if transform.build.language and transform.build.image:
                raise Exception("cannot specify both a build `language` and `image`")
            if any(
                i.pfs is not None and i.pfs.name in ("build", "source")
                for i in pipeline_inputs(input)
            ):
                raise Exception(
                    "build step-enabled pipelines cannot have inputs with the name "
                    + "'build' or 'source', as they are reserved for build assets"
                )

            build_path = Path(transform.build.path or ".")
            if not build_path.exists():
                raise Exception("build path {} does not exist".format(build_path))
            if (build_path / ".pachignore").exists():
                warnings.warn(
                    "detected a '.pachignore' file, but it's unsupported by python_pachyderm -- use `pachctl` instead",
                    RuntimeWarning,
                )

            build_pipeline_name = "{}_build".format(pipeline_name)

            image = transform.build.image
            if not image:
                version = self.get_remote_version()
                version_str = "{}.{}.{}{}".format(
                    version.major, version.minor, version.micro, version.additional
                )
                image = "pachyderm/{}-build:{}".format(
                    transform.build.language, version_str
                )
            if not transform.image:
                transform.image = image

            def create_build_pipeline_input(name):
                return pps_proto.Input(
                    pfs=pps_proto.PFSInput(
                        name=name,
                        glob="/",
                        repo=build_pipeline_name,
                        branch=name,
                    )
                )

            self.create_repo(build_pipeline_name, update=True)

            self._req(
                Service.PPS,
                "CreatePipeline",
                pipeline=pps_proto.Pipeline(name=build_pipeline_name),
                transform=pps_proto.Transform(image=image, cmd=["sh", "./build.sh"]),
                parallelism_spec=pps_proto.ParallelismSpec(constant=1),
                input=create_build_pipeline_input("source"),
                output_branch="build",
                update=update,
            )

            with self.put_file_client() as pfc:
                if update:
                    pfc.delete_file((build_pipeline_name, "source"), "/")
                for root, _, filenames in os.walk(str(build_path)):
                    for filename in filenames:
                        source_filepath = os.path.join(root, filename)
                        dest_filepath = os.path.join(
                            "/",
                            os.path.relpath(source_filepath, start=str(build_path)),
                        )
                        pfc.put_file_from_filepath(
                            (build_pipeline_name, "source"),
                            dest_filepath,
                            source_filepath,
                        )

            input = pps_proto.Input(
                cross=[
                    create_build_pipeline_input("source"),
                    create_build_pipeline_input("build"),
                    input,
                ]
            )

            if not transform.cmd:
                transform.cmd[:] = ["sh", "/pfs/build/run.sh"]

        return self._req(
            Service.PPS,
            "CreatePipeline",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
            transform=transform,
            parallelism_spec=parallelism_spec,
            hashtree_spec=hashtree_spec,
            egress=egress,
            update=update,
            output_branch=output_branch,
            resource_requests=resource_requests,
            resource_limits=resource_limits,
            input=input,
            description=description,
            cache_size=cache_size,
            enable_stats=enable_stats,
            reprocess=reprocess,
            max_queue_size=max_queue_size,
            metadata=metadata,
            service=service,
            chunk_spec=chunk_spec,
            datum_timeout=datum_timeout,
            job_timeout=job_timeout,
            salt=salt,
            standby=standby,
            datum_tries=datum_tries,
            scheduling_spec=scheduling_spec,
            pod_patch=pod_patch,
            spout=spout,
            spec_commit=spec_commit,
            sidecar_resource_limits=sidecar_resource_limits,
            reprocess_spec=reprocess_spec,
            autoscaling=autoscaling,
        )
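    # Illustrative usage sketch (assumptions: a connected ``python_pachyderm.Client``
    # named ``client`` and an existing input repo called "images" are placeholders):
    # a minimal pipeline that copies its input to its output.
    #
    #   client.create_pipeline(
    #       "copy",
    #       transform=pps_proto.Transform(
    #           cmd=["sh"],
    #           stdin=["cp -r /pfs/images/* /pfs/out/"],
    #           image="alpine",
    #       ),
    #       input=pps_proto.Input(pfs=pps_proto.PFSInput(glob="/*", repo="images")),
    #   )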
    def create_pipeline_from_request(self, req):
        """Creates a pipeline from a ``CreatePipelineRequest`` object. Usually
        this would be used in conjunction with
        ``util.parse_json_pipeline_spec()`` or
        ``util.parse_dict_pipeline_spec()``. If you're in pure Python and not
        working with a pipeline spec file, the sibling method
        ``create_pipeline()`` is more ergonomic.

        Parameters
        ----------
        req : CreatePipelineRequest protobuf
            A ``CreatePipelineRequest`` object.
        """
        return self._req(Service.PPS, "CreatePipeline", req=req)
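    # Illustrative usage sketch (assumptions: ``client`` is a connected
    # ``python_pachyderm.Client``, "pipeline.json" is a placeholder spec file, and
    # the exact import path/signature of the helper referenced in the docstring
    # above is assumed here):
    #
    #   from python_pachyderm import util
    #   with open("pipeline.json") as f:
    #       req = util.parse_json_pipeline_spec(f.read())
    #   client.create_pipeline_from_request(req)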
    def create_tf_job_pipeline(
        self,
        pipeline_name,
        tf_job,
        parallelism_spec=None,
        hashtree_spec=None,
        egress=None,
        update=None,
        output_branch=None,
        scale_down_threshold=None,
        resource_requests=None,
        resource_limits=None,
        input=None,
        description=None,
        cache_size=None,
        enable_stats=None,
        reprocess=None,
        max_queue_size=None,
        service=None,
        chunk_spec=None,
        datum_timeout=None,
        job_timeout=None,
        salt=None,
        standby=None,
        datum_tries=None,
        scheduling_spec=None,
        pod_patch=None,
        spout=None,
        spec_commit=None,
    ):
        """Creates a TFJob pipeline. For more info, please refer to the
        pipeline spec document:
        http://docs.pachyderm.io/en/latest/reference/pipeline_spec.html

        Parameters
        ----------
        pipeline_name : str
            The pipeline name.
        tf_job : TFJob protobuf
            Pachyderm uses this to create TFJobs when running in a Kubernetes
            cluster on which kubeflow has been installed.
        parallelism_spec : ParallelismSpec protobuf, optional
            An optional ``ParallelismSpec`` object.
        hashtree_spec : HashtreeSpec protobuf, optional
            An optional ``HashtreeSpec`` object.
        egress : Egress protobuf, optional
            An optional ``Egress`` object.
        update : bool, optional
            Whether this should behave as an upsert.
        output_branch : str, optional
            The branch to output results on.
        scale_down_threshold : Duration protobuf, optional
            An optional ``Duration`` object.
        resource_requests : ResourceSpec protobuf, optional
            An optional ``ResourceSpec`` object.
        resource_limits : ResourceSpec protobuf, optional
            An optional ``ResourceSpec`` object.
        input : Input protobuf, optional
            An optional ``Input`` object.
        description : str, optional
            Description of the pipeline.
        cache_size : str, optional
            An optional string.
        enable_stats : bool, optional
            An optional bool.
        reprocess : bool, optional
            If true, Pachyderm forces the pipeline to reprocess all datums.
            It only has meaning if `update` is ``True``.
        max_queue_size : int, optional
            An optional int.
        service : Service protobuf, optional
            An optional ``Service`` object.
        chunk_spec : ChunkSpec protobuf, optional
            An optional ``ChunkSpec`` object.
        datum_timeout : Duration protobuf, optional
            An optional ``Duration`` object.
        job_timeout : Duration protobuf, optional
            An optional ``Duration`` object.
        salt : str, optional
            An optional string.
        standby : bool, optional
            An optional bool.
        datum_tries : int, optional
            An optional int.
        scheduling_spec : SchedulingSpec protobuf, optional
            An optional ``SchedulingSpec`` object.
        pod_patch : str, optional
            An optional string.
        spout : Spout protobuf, optional
            An optional ``Spout`` object.
        spec_commit : Commit protobuf, optional
            An optional ``Commit`` object.
        """
        return self._req(
            Service.PPS,
            "CreatePipeline",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
            tf_job=tf_job,
            parallelism_spec=parallelism_spec,
            hashtree_spec=hashtree_spec,
            egress=egress,
            update=update,
            output_branch=output_branch,
            scale_down_threshold=scale_down_threshold,
            resource_requests=resource_requests,
            resource_limits=resource_limits,
            input=input,
            description=description,
            cache_size=cache_size,
            enable_stats=enable_stats,
            reprocess=reprocess,
            max_queue_size=max_queue_size,
            service=service,
            chunk_spec=chunk_spec,
            datum_timeout=datum_timeout,
            job_timeout=job_timeout,
            salt=salt,
            standby=standby,
            datum_tries=datum_tries,
            scheduling_spec=scheduling_spec,
            pod_patch=pod_patch,
            spout=spout,
            spec_commit=spec_commit,
        )
    def inspect_pipeline(self, pipeline_name, history=None):
        """.. # noqa: W505

        Inspects a pipeline. Returns a ``PipelineInfo`` object.

        Parameters
        ----------
        pipeline_name : str
            The pipeline name.
        history : int, optional
            Indicates to return historical versions of pipelines. Semantics
            are:

            - 0: Return current version of pipelines.
            - 1: Return the above and pipelines from the next most recent
              version.
            - 2: etc.
            - -1: Return pipelines from all historical versions.
        """
        pipeline = pps_proto.Pipeline(name=pipeline_name)

        if history is None:
            return self._req(Service.PPS, "InspectPipeline", pipeline=pipeline)
        else:
            # `InspectPipeline` doesn't support history, but `ListPipeline`
            # with a pipeline filter does, so we use that here
            pipelines = self._req(
                Service.PPS, "ListPipeline", pipeline=pipeline, history=history
            ).pipeline_info
            assert len(pipelines) <= 1
            return pipelines[0] if len(pipelines) else None
    def list_pipeline(self, history=None, allow_incomplete=None, jqFilter=None):
        """.. # noqa: W505

        Lists pipelines. Returns a ``PipelineInfos`` object.

        Parameters
        ----------
        history : int, optional
            Indicates to return historical versions of pipelines. Semantics
            are:

            - 0: Return current version of pipelines.
            - 1: Return the above and pipelines from the next most recent
              version.
            - 2: etc.
            - -1: Return pipelines from all historical versions.
        allow_incomplete : bool, optional
            If True, causes ``list_pipeline()`` to return ``PipelineInfos``
            with incomplete data where the pipeline spec cannot be retrieved.
            Incomplete ``PipelineInfos`` will have a ``None`` `Transform`
            field, but will have the fields present in ``EtcdPipelineInfo``.
        jqFilter : str, optional
            A ``jq`` filter that can restrict the list of pipelines returned.
        """
        return self._req(
            Service.PPS,
            "ListPipeline",
            history=history,
            allow_incomplete=allow_incomplete,
            jqFilter=jqFilter,
        )
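    # Illustrative usage sketch (assuming a connected ``python_pachyderm.Client``
    # named ``client``): ``list_pipeline()`` returns a single ``PipelineInfos``
    # message, so iterate over its ``pipeline_info`` field:
    #
    #   for pipeline_info in client.list_pipeline(history=-1).pipeline_info:
    #       print(pipeline_info.pipeline.name)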
    def delete_pipeline(
        self, pipeline_name, force=None, keep_repo=None, split_transaction=None
    ):
        """Deletes a pipeline.

        Parameters
        ----------
        pipeline_name : str
            The pipeline name.
        force : bool, optional
            Whether to force delete.
        keep_repo : bool, optional
            Whether to keep the output repo.
        split_transaction : bool, optional
            Whether Pachyderm attempts to delete the pipeline in a single
            database transaction. Setting this to ``True`` can work around
            certain Pachyderm errors, but, if set, the ``delete_repo()`` call
            may need to be retried.
        """
        return self._req(
            Service.PPS,
            "DeletePipeline",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
            force=force,
            keep_repo=keep_repo,
            split_transaction=split_transaction,
        )
    def delete_all_pipelines(self, force=None):
        """Deletes all pipelines.

        Parameters
        ----------
        force : bool, optional
            Whether to force delete.
        """
        return self._req(Service.PPS, "DeletePipeline", all=True, force=force)
    def start_pipeline(self, pipeline_name):
        """Starts a pipeline.

        Parameters
        ----------
        pipeline_name : str
            The pipeline name.
        """
        return self._req(
            Service.PPS,
            "StartPipeline",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
        )
    def stop_pipeline(self, pipeline_name):
        """Stops a pipeline.

        Parameters
        ----------
        pipeline_name : str
            The pipeline name.
        """
        return self._req(
            Service.PPS, "StopPipeline", pipeline=pps_proto.Pipeline(name=pipeline_name)
        )
    def run_pipeline(self, pipeline_name, provenance=None, job_id=None):
        """Runs a pipeline.

        Parameters
        ----------
        pipeline_name : str
            The pipeline name.
        provenance : List[CommitProvenance protobuf], optional
            A list representing the pipeline execution provenance.
        job_id : str, optional
            A specific job ID to run.
        """
        return self._req(
            Service.PPS,
            "RunPipeline",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
            provenance=provenance,
            job_id=job_id,
        )
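    # Illustrative usage sketch (assuming a connected ``python_pachyderm.Client``
    # named ``client``; the pipeline name and job ID are placeholders): rerun a
    # pipeline on the current input heads, or rerun a specific job by its ID:
    #
    #   client.run_pipeline("edges")
    #   client.run_pipeline("edges", job_id="0123456789abcdef")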
    def run_cron(self, pipeline_name):
        """Explicitly triggers a pipeline with one or more cron inputs to run
        now.

        Parameters
        ----------
        pipeline_name : str
            The pipeline name.
        """
        return self._req(
            Service.PPS,
            "RunCron",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
        )
    def create_secret(self, secret_name, data, labels=None, annotations=None):
        """Creates a new secret.

        Parameters
        ----------
        secret_name : str
            The name of the secret to create.
        data : Dict[str, Union[str, bytes]]
            The data to store in the secret. Each key must consist of
            alphanumeric characters, ``-``, ``_``, or ``.``.
        labels : Dict[str, str], optional
            Kubernetes labels to attach to the secret.
        annotations : Dict[str, str], optional
            Kubernetes annotations to attach to the secret.
        """
        encoded_data = {}
        for k, v in data.items():
            if isinstance(v, str):
                v = v.encode("utf8")
            encoded_data[k] = base64.b64encode(v).decode("utf8")

        f = json.dumps(
            {
                "kind": "Secret",
                "apiVersion": "v1",
                "metadata": {
                    "name": secret_name,
                    "labels": labels,
                    "annotations": annotations,
                },
                "data": encoded_data,
            }
        ).encode("utf8")

        return self._req(Service.PPS, "CreateSecret", file=f)
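    # Illustrative usage sketch (assuming a connected ``python_pachyderm.Client``
    # named ``client``; names and values are placeholders): both string and bytes
    # values are accepted and are base64-encoded before being sent:
    #
    #   client.create_secret(
    #       "my-secret",
    #       {"password": "hunter2", "token": b"\x00\x01"},
    #       labels={"app": "example"},
    #   )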
    def delete_secret(self, secret_name):
        """Deletes a secret.

        Parameters
        ----------
        secret_name : str
            The name of the secret to delete.
        """
        secret = pps_proto.Secret(name=secret_name)
        return self._req(Service.PPS, "DeleteSecret", secret=secret)
    def list_secret(self):
        """Lists secrets. Returns a list of ``SecretInfo`` objects."""
        return self._req(
            Service.PPS,
            "ListSecret",
            req=pps_proto.google_dot_protobuf_dot_empty__pb2.Empty(),
        ).secret_info
    def inspect_secret(self, secret_name):
        """Inspects a secret.

        Parameters
        ----------
        secret_name : str
            The name of the secret to inspect.
        """
        secret = pps_proto.Secret(name=secret_name)
        return self._req(Service.PPS, "InspectSecret", secret=secret)
    def delete_all(self):
        """Deletes everything in Pachyderm."""
        return self._req(
            Service.PPS,
            "DeleteAll",
            req=pps_proto.google_dot_protobuf_dot_empty__pb2.Empty(),
        )
    def get_pipeline_logs(
        self,
        pipeline_name,
        data_filters=None,
        master=None,
        datum=None,
        follow=None,
        tail=None,
        use_loki_backend=None,
        since=None,
    ):
        """Gets logs for a pipeline. Yields ``LogMessage`` objects.

        Parameters
        ----------
        pipeline_name : str
            The name of the pipeline.
        data_filters : List[str], optional
            A list of the names of input files from which we want processing
            logs. This may contain multiple files, in case `pipeline_name`
            contains multiple inputs. Each filter may be an absolute path of
            a file within a repo, or it may be a hash for that file (to
            search for files at specific versions).
        master : bool, optional
            If true, includes logs from the master.
        datum : Datum protobuf, optional
            Filters log lines for the specified datum.
        follow : bool, optional
            If true, continue to follow new logs as they appear.
        tail : int, optional
            If nonzero, the number of lines from the end of the logs to
            return. Note: tail applies per container, so you will get
            `tail` * <number of pods> total lines back.
        use_loki_backend : bool, optional
            If true, use loki as a backend, rather than Kubernetes, for
            fetching logs. Requires a loki-enabled cluster.
        since : Duration protobuf, optional
            Specifies how far in the past to return logs from.
        """
        return self._req(
            Service.PPS,
            "GetLogs",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
            data_filters=data_filters,
            master=master,
            datum=datum,
            follow=follow,
            tail=tail,
            use_loki_backend=use_loki_backend,
            since=since,
        )
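    # Illustrative usage sketch (assuming a connected ``python_pachyderm.Client``
    # named ``client`` and a placeholder pipeline "edges"): stream the last 10
    # lines per container and keep following new output:
    #
    #   for msg in client.get_pipeline_logs("edges", tail=10, follow=True):
    #       print(msg.message)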
    def get_job_logs(
        self,
        job_id,
        data_filters=None,
        datum=None,
        follow=None,
        tail=None,
        use_loki_backend=None,
        since=None,
    ):
        """Gets logs for a job. Yields ``LogMessage`` objects.

        Parameters
        ----------
        job_id : str
            The ID of the job.
        data_filters : List[str], optional
            A list of the names of input files from which we want processing
            logs. This may contain multiple files, in case the job's pipeline
            contains multiple inputs. Each filter may be an absolute path of
            a file within a repo, or it may be a hash for that file (to
            search for files at specific versions).
        datum : Datum protobuf, optional
            Filters log lines for the specified datum.
        follow : bool, optional
            If true, continue to follow new logs as they appear.
        tail : int, optional
            If nonzero, the number of lines from the end of the logs to
            return. Note: tail applies per container, so you will get
            `tail` * <number of pods> total lines back.
        use_loki_backend : bool, optional
            If true, use loki as a backend, rather than Kubernetes, for
            fetching logs. Requires a loki-enabled cluster.
        since : Duration protobuf, optional
            Specifies how far in the past to return logs from.
        """
        return self._req(
            Service.PPS,
            "GetLogs",
            job=pps_proto.Job(id=job_id),
            data_filters=data_filters,
            datum=datum,
            follow=follow,
            tail=tail,
            use_loki_backend=use_loki_backend,
            since=since,
        )
    def garbage_collect(self, memory_bytes=None):
        """Runs garbage collection.

        Parameters
        ----------
        memory_bytes : int, optional
            How much memory to use in computing which objects are alive. A
            larger number will result in more precise garbage collection (at
            the cost of more memory usage).
        """
        return self._req(Service.PPS, "GarbageCollect", memory_bytes=memory_bytes)
def pipeline_inputs(root):
    if root is None:
        return
    elif root.cross is not None:
        for i in root.cross:
            yield from pipeline_inputs(i)
    elif root.join is not None:
        for i in root.join:
            yield from pipeline_inputs(i)
    elif root.union is not None:
        for i in root.union:
            yield from pipeline_inputs(i)
    yield root
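# Illustrative usage sketch (repo names are placeholders): ``pipeline_inputs()``
# walks a possibly nested ``Input`` and yields its leaf (and intermediate) inputs,
# which is how ``create_pipeline()`` checks for reserved input names above:
#
#   spec = pps_proto.Input(
#       cross=[
#           pps_proto.Input(pfs=pps_proto.PFSInput(repo="images", glob="/*")),
#           pps_proto.Input(pfs=pps_proto.PFSInput(repo="labels", glob="/*")),
#       ]
#   )
#   for i in pipeline_inputs(spec):
#       if i.pfs.repo:
#           print(i.pfs.repo)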