...

  1. Write the mapper script
    Code Block
    ~>cat phaseMapper.sh
    #!/bin/sh
    
    RESULT_BUCKET=s3://sagetest-YourUsername/results
    
    # Send some bogus output to stdout and stderr so that the streaming task
    # does not time out during phase processing, since the phase algorithm
    # does not write any output on a regular basis
    perl -e 'while(! -e "./timetostop") { print "keepalive\n"; print STDERR "reporter:status:keepalive\n"; sleep 300; }' &
    
    while read S3_INPUT_FILE; do
        echo input to process ${S3_INPUT_FILE} 1>&2
    
        # For debugging purposes, print out the files cached for us
        ls -la 1>&2
    
        # Parse the s3 file path to get the file name
        LOCAL_INPUT_FILE=$(echo ${S3_INPUT_FILE} | perl -pe 'if (/^((s3[n]?):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$/) {print "$6\n"};' | head -1)
    
        # Download the file from S3
        echo hadoop fs -get ${S3_INPUT_FILE} ${LOCAL_INPUT_FILE} 1>&2
        hadoop fs -get ${S3_INPUT_FILE} ${LOCAL_INPUT_FILE} 1>&2
    
        # Run phase processing
        ./phase ${LOCAL_INPUT_FILE} ${LOCAL_INPUT_FILE}_out 100 1 100
    
        # Upload the output files
        ls -la ${LOCAL_INPUT_FILE}*_out* 1>&2
        for f in ${LOCAL_INPUT_FILE}*_out*
        do
            echo hadoop fs -put $f ${RESULT_BUCKET}/$LOCAL_INPUT_FILE/$f 1>&2
            hadoop fs -put $f ${RESULT_BUCKET}/$LOCAL_INPUT_FILE/$f 1>&2
        done
        echo processed ${S3_INPUT_FILE} 1>&2
        echo 1>&2
        echo 1>&2
    done
    
    # Tell our background keepalive task to exit
    touch ./timetostop
    
    exit 0
    
  2. Upload the mapper script to S3 via the AWS console or s3curl (a quick local sanity check before uploading is sketched after this list)
    Code Block
    /work/platform/bin/s3curl.pl --id $USER --put phaseMapper.sh https://s3.amazonaws.com/sagetest-$USER/scripts/phaseMapper.sh
    
  3. Upload the phase binary to S3 too
    Code Block
    /work/platform/bin/s3curl.pl --id $USER --put PHASE https://s3.amazonaws.com/sagetest-$USER/scripts/phase
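
Before uploading, it can save a round trip to sanity-check the mapper script locally. A minimal sketch, using nothing beyond a standard POSIX shell (the hadoop commands inside the script are only exercised once it runs on the EMR nodes):

Code Block
chmod +x phaseMapper.sh   # make the script executable
sh -n phaseMapper.sh      # parse-only check; no output means the script has no syntax errors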
    

...

  1. Write your input file
    Code Block
    ~>cat phaseInput.txt
    s3://sagetest-YourUsername/input/ProSM_chrom_MT.phase.inp
    ... many more files, one per chromosome
    
  2. Upload your input file to S3 via the AWS console or s3curl
    Code Block
    /work/platform/bin/s3curl.pl --id $USER --put phaseInput.txt https://s3.amazonaws.com/sagetest-$USER/input/phaseInput.txt
    
  3. Also upload all the data files referenced in phaseInput.txt to the location specified in that file (a bulk-upload loop is sketched below).
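
A loop like the following can handle the per-chromosome uploads in bulk. This is a hypothetical sketch, assuming the data files sit in the current directory under the same basenames that appear in phaseInput.txt:

Code Block
# Hypothetical bulk upload: for each S3 path listed in phaseInput.txt,
# upload the local file with the same basename to that path via s3curl.
while read S3_PATH; do
    FILE=$(basename ${S3_PATH})
    HTTPS_URL=$(echo ${S3_PATH} | sed 's|^s3://|https://s3.amazonaws.com/|')
    /work/platform/bin/s3curl.pl --id $USER --put ${FILE} ${HTTPS_URL}
done < phaseInput.txt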

...

  1. Write your job configuration. Note that you need to change the output location each time you run this!
    Code Block
    ~>cat phase.json
    [
        {
            "Name": "MapReduce Step 1: Run Phase",
            "ActionOnFailure": "CANCEL_AND_WAIT",
            "HadoopJarStep": {
                "Jar": "/home/hadoop/contrib/streaming/hadoop-streaming.jar",
                    "Args": [
                        "-input",     "s3n://sagetestsagebio-YourUsernameYourUnixUsername/input/phaseInput.txt",
                        "-output",    "s3n://sagetestsagebio-YourUsernameYourUnixUsername/output/phaseTry1",
                        "-mapper",    "s3n://sagetestsagebio-YourUsernameYourUnixUsername/scripts/phaseMapper.sh",
                        "-cacheFile", "s3n://sagetestsagebio-YourUsernameYourUnixUsername/scripts/phase#phase",
                        "-jobconf",   "mapred.reduce.tasks=0",
                        "-jobconf",   "mapred.task.timeout=604800000",
                    ]
                }
        }
    ]
    
  2. Put it on one of the shared servers sodo/ballard/belltown (a quick way to validate the JSON is sketched below).
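
Streaming steps fail with fairly opaque errors when the step file is not valid JSON (for example, a trailing comma after the last Args entry), so it is worth validating the file before submitting it. A minimal sketch, assuming Python is available on the shared server:

Code Block
python -m json.tool phase.json   # prints the parsed JSON on success, or points at the syntax error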

...

  1. ssh to one of the shared servers sodo/ballard/belltown
  2. Kick off the Elastic MapReduce job. This will start 14 hosts: one for the master and 13 for the slaves running the map tasks. (If the job flow needs to be shut down early, see the sketch after this list.)
    Code Block
    ~>/work/platform/bin/elastic-mapreduce-cli/elastic-mapreduce --credentials ~/.ssh/$USER-credentials.json --create \
    --enable-debugging --bootstrap-action s3://elasticmapreduce/bootstrap-actions/configurations/latest/memory-intensive \
    --master-instance-type=m1.small --slave-instance-type=c1.medium --num-instances=14 --json phase.json --name phaseTry1
    
    Created job flow j-GA47B7VD991Q
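
If the job flow was started with the wrong configuration, it can be shut down rather than left running (and billing) until its steps fail. A sketch, assuming the CLI's --terminate option and the job flow id printed above:

Code Block
~>/work/platform/bin/elastic-mapreduce-cli/elastic-mapreduce --credentials ~/.ssh/$USER-credentials.json \
--terminate --jobflow j-GA47B7VD991Q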
    

...

Code Block
~>/work/platform/bin/elastic-mapreduce-cli/elastic-mapreduce --credentials ~/.ssh/$USER-credentials.json --list --jobflow j-GA47B7VD991Q

j-GA47B7VD991Q     RUNNING        ec2-174-129-134-200.compute-1.amazonaws.com       phaseTry1
   RUNNING        MapReduce Step 1: Run Phase
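
For week-long PHASE runs it can be convenient to poll the job flow rather than re-running --list by hand. A hypothetical polling sketch built on the same --list command shown above:

Code Block
# Poll every ten minutes until the job flow leaves its active states.
EMR=/work/platform/bin/elastic-mapreduce-cli/elastic-mapreduce
while $EMR --credentials ~/.ssh/$USER-credentials.json --list --jobflow j-GA47B7VD991Q \
      | grep -qE 'STARTING|BOOTSTRAPPING|RUNNING'; do
    sleep 600
done
echo job flow j-GA47B7VD991Q is no longer running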

...

Code Block
~>/work/platform/bin/elastic-mapreduce-cli/elastic-mapreduce --credentials ~/.ssh/$USER-credentials.json --json phase.json --jobflow j-GA47B7VD991Q
Added jobflow steps
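
As noted above, the -output location must be unique for every run; if phase.json is resubmitted verbatim, the new step will fail because the output directory already exists. A hypothetical one-liner to bump the try number before adding the step again (assumes GNU sed on the shared server):

Code Block
sed -i 's|output/phaseTry1|output/phaseTry2|' phase.json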

...