Elegant Solutions to Hard Problems

Dataset Management with Hundreds of TB

Brett Smith ✧ brett@curoverse.com

New York Times, August 17, 2014: "For Big-Data Scientists, ‘Janitor Work’ Is Key Hurdle to Insights"

Arvados

About Arvados

  • Born of the Personal Genome Project at Harvard Medical School
  • Servers written primarily in Ruby on Rails and Go, under the GNU AGPL version 3
  • SDKs available for many languages, under the Apache License 2.0
  • On the Web at arvados.org

Keep

Content-addressable storage

Diagram illustrates that files are uploaded to Keep along with their checksum, then retrieved with that same checksum

Key Properties

  • Immutability
  • Versioning

Keep Manifests

Map data in Keep
to a directory structure

. 92c734cacbbfae6bfc6bcccc85d56235+67108864
  b0bd8ea685e9f07021166939e73b07c3+67108864
  71949ee5193a259059db7928fcc20554+67108864
  4d1bfaf7e14caec8e485fc9c91ee875b+5634552
 0:125064007:lobstr_v3.0.2_hg19_ref.bed
 125064007:81896700:lobstr_v3.0.2_hg19_strinfo.tab
 206960707:437:readme.txt

./lobstr_v3.0.2_hg19_ref ccf1c5ba3caf907c1ff90caede3cc439+67108864
                         0111b55f0503ea92ebfd0d8dabf316d6+67108864
                         d0405fc64ce9f8b20d1140343f891675+67108864 …
 0:365:lobSTR_chromsizes.tab
 365:47109121:lobSTR_mergedref.bed
 47109486:1885053904:lobSTR_ref.fasta …

git

commit ac21f0d45a76294aaca0c0c0fdf06eb72d03368d
Author: Tim Pierce <twp@curoverse.com>
Date:   Wed Nov 12 16:07:15 2014 -0500

    3609: fix "NameError: name 'rcode' is not defined"

commit d5b8652f53bf2ac40cbb7fcd31597d0cd08cac98
Author: Peter Amstutz <peter.amstutz@curoverse.com>
Date:   Tue Nov 4 21:05:00 2014 -0500

    3609: Further improve documentation and code comments.

commit 2fe1e71c5cc17dbf06fd7b1e188fd0279c07d3ca
Author: Peter Amstutz <peter.amstutz@curoverse.com>
Date:   Tue Nov 4 16:28:54 2014 -0500

    3609: Documentation improvements.

Rules

  • Exit code 0
  • Register output in Keep

Docker

REPOSITORY          TAG      IMAGE ID       CREATED       VIRTUAL SIZE
arvados/jobs        latest   780db8d8b8fb   8 days ago    1.498 GB
arvados/base        latest   6596f5cc061d   8 days ago    1.356 GB
arvados/shell       latest   bc900e2dc216   3 weeks ago   1.578 GB
arvados/sso         latest   56d52272d532   3 weeks ago   1.699 GB
arvados/keep        latest   5610f663f809   3 weeks ago   133.9 MB
arvados/workbench   latest   17f47aa40352   3 weeks ago   1.825 GB
arvados/doc         latest   9add80340a15   3 weeks ago   1.478 GB
arvados/compute     latest   1018b8b7f0e3   3 weeks ago   1.817 GB
arvados/slurm       latest   d3a22c0a9429   3 weeks ago   1.534 GB
arvados/api         latest   b48c69866320   3 weeks ago   2.141 GB
arvados/passenger   latest   afe396104582   3 weeks ago   1.673 GB

Crunch

Deployment and dispatch

Arvados API Server

  • Discoverable JSON REST API
  • Resolves symbolic references to content hashes
  • Controls access

Jobs

{
 "repository": "arvados",
 "script": "run-command",
 "script_version": "master",

 "runtime_constraints": {
   "docker_image": "bcosc/lobstr"
 },

 "script_parameters": {
   "command": [
     "bash", "-c", "(lobSTR --p1 $(glob $(dir $(read))/*1.f*) …)"
   ],
   "read": "513a1bdbbc2ac2165a0b84e37ab91e31+10812/tests/",
   "path_to_prefix": "d341a6f1db391a780d694e240e95e475+3805"
 }
}

The same job after the API server resolves symbolic references to content hashes:

{
 "repository": "arvados",
 "script": "run-command",
 "script_version": "462fbba4ab742a72a3cf057dc06610a51af6b0f0",
 "supplied_script_version": "master",

 "docker_image_locator": "ea32030ce02eef44d0f922baaf1b4461+4032",
 "runtime_constraints": {
   "docker_image": "bcosc/lobstr"
 },

 "script_parameters": {
   "command": [
     "bash", "-c", "(lobSTR --p1 $(glob $(dir $(read))/*1.f*) …)"
   ],
   "read": "513a1bdbbc2ac2165a0b84e37ab91e31+10812/tests/",
   "path_to_prefix": "d341a6f1db391a780d694e240e95e475+3805"
 }
}

Pipelines

{
 "components": {
  "run_lobSTR": {
    "repository": "arvados",
    "script": "run-command",
    "script_version": "462fbba4ab742a72a3cf057dc06610a51af6b0f0",
    …
  },

  "sort_bam": {
    "repository": "arvados",
    "script": "run-command",
    "script_version": "462fbba4ab742a72a3cf057dc06610a51af6b0f0",

    "script_parameters": {
     "command": [
      "samtools", "sort", "$(glob $(dir $(bam))/*.bam)", "test.sorted"
     ],
     "bam": {
      "output_of": "run_lobSTR"
     }
    },
    …
  }
 }
}
_anonymous_0 run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_50f5cf2ce3c10d74040c3d6b4cb377d2 run_lobSTR 74a58b29f4a42d2108eebc56231bc657+95 74a58b29f4a42d2108eebc56231bc657+95 run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_50f5cf2ce3c10d74040c3d6b4cb377d2->74a58b29f4a42d2108eebc56231bc657+95 output 513a1bdbbc2ac2165a0b84e37ab91e31+10812 lobstr-git-code 513a1bdbbc2ac2165a0b84e37ab91e31+10812->run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_50f5cf2ce3c10d74040c3d6b4cb377d2 read run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_9ff9c757e69bdf0f36327abba6187903 allelotype_run 513a1bdbbc2ac2165a0b84e37ab91e31+10812->run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_9ff9c757e69bdf0f36327abba6187903 illumina d341a6f1db391a780d694e240e95e475+3805 lobSTR v3 ref d341a6f1db391a780d694e240e95e475+3805->run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_50f5cf2ce3c10d74040c3d6b4cb377d2 path_to_prefix d341a6f1db391a780d694e240e95e475+3805->run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_9ff9c757e69bdf0f36327abba6187903 strinfo_tab run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_346478b620ecd9502a1f4c5ae9a97333 sort_bam 74a58b29f4a42d2108eebc56231bc657+95->run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_346478b620ecd9502a1f4c5ae9a97333 bam 69c74a319106fb26cabcaed596259ff4+65 69c74a319106fb26cabcaed596259ff4+65 run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_346478b620ecd9502a1f4c5ae9a97333->69c74a319106fb26cabcaed596259ff4+65 output run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_5ef3b1057f62dcd76e3d7907c0f37dee index_sort 69c74a319106fb26cabcaed596259ff4+65->run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_5ef3b1057f62dcd76e3d7907c0f37dee sorted 2df86012416271405c76d95937298910+140 2df86012416271405c76d95937298910+140 run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_5ef3b1057f62dcd76e3d7907c0f37dee->2df86012416271405c76d95937298910+140 output 
2df86012416271405c76d95937298910+140->run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_9ff9c757e69bdf0f36327abba6187903 sortedbam 672983378bb4a641c85f809dd69b0a6f+232 672983378bb4a641c85f809dd69b0a6f+232 run-command_462fbba4ab742a72a3cf057dc06610a51af6b0f0_9ff9c757e69bdf0f36327abba6187903->672983378bb4a641c85f809dd69b0a6f+232 output
_anonymous_0 513a1bdbbc2ac2165a0b84e37ab91e31+10812 lobstr-git-code qr1hi-8i9sb-yw6fow2fvh29mlb bash -c (lobSTR --p1 $(glob $(dir $(read))/*1.f*q*) --p2 $(glob $(dir $(read))/*2.f*q*) -q --index-prefix $(dir $(path_to_prefix))/lobstr_v3.0.2_hg19_ref/lobSTR_ -o test --rg-sample mysample --rg-lib mylibrary) 2014-10-14 15:13:38 UTC 513a1bdbbc2ac2165a0b84e37ab91e31+10812->qr1hi-8i9sb-yw6fow2fvh29mlb read 74a58b29f4a42d2108eebc56231bc657+95 74a58b29f4a42d2108eebc56231bc657+95 qr1hi-8i9sb-yw6fow2fvh29mlb->74a58b29f4a42d2108eebc56231bc657+95 output e1aafd8d0c36cabe546178467ff595f5+85 e1aafd8d0c36cabe546178467ff595f5+85 qr1hi-8i9sb-yw6fow2fvh29mlb->e1aafd8d0c36cabe546178467ff595f5+85 log d341a6f1db391a780d694e240e95e475+3805 lobSTR v3 ref d341a6f1db391a780d694e240e95e475+3805->qr1hi-8i9sb-yw6fow2fvh29mlb path_to_prefix ea32030ce02eef44d0f922baaf1b4461+4032 bcosc/lobstr ea32030ce02eef44d0f922baaf1b4461+4032->qr1hi-8i9sb-yw6fow2fvh29mlb docker_image 462fbba4ab742a72a3cf057dc06610a51af6b0f0 git:462fbba4ab742a72a3cf057dc06610a51af6b0f0 462fbba4ab742a72a3cf057dc06610a51af6b0f0->qr1hi-8i9sb-yw6fow2fvh29mlb script_version

Workbench

Workbench screenshot displaying pipeline provenance graph
Workbench screenshot displaying pipeline inputs table
Workbench screenshot displaying pipeline input selection dialog

The Big Picture

Diagram of the Arvados architecture: data enters Keep, then the API server.  When jobs are submitted, they can create new data in Keep and new jobs.

Benefits

  • Provenance
  • Reproducibility
  • Less janitorial work

Learn more