...

  1. Write the mapper script. It reads S3 input file paths from stdin, one per line (an example input file is sketched after this list).
    Code Block
    ~>cat phaseMapper.sh
    #!/bin/sh
    
    RESULT_BUCKET=s3://sagetest-YourUsername/results
    
    # Send some bogus output to stdout so that MapReduce does not time out
    # during phase processing, since the phase algorithm does not write
    # output to stdout on a regular basis
    perl -e 'while(! -e "./timetostop") { print "keepalive\n"; print STDERR "reporter:status:keepalive\n"; sleep 300; }' &
    
    while read S3_INPUT_FILE; do
        echo input to process ${S3_INPUT_FILE} 1>&2
    
        # For debugging purposes, print out the files cached for us
        ls -la 1>&2
    
        # Parse the s3 file path to get the file name
        LOCAL_INPUT_FILE=$(echo ${S3_INPUT_FILE} | perl -pe 'if (/^((s3[n]?):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$/) {print "$6\n"};' | head -1)
    
        # Download the file from S3
        echo hadoop fs -get ${S3_INPUT_FILE} ${LOCAL_INPUT_FILE} 1>&2
        hadoop fs -get ${S3_INPUT_FILE} ${LOCAL_INPUT_FILE} 1>&2
    
        # Run phase processing
        ./phase ${LOCAL_INPUT_FILE} ${LOCAL_INPUT_FILE}_out 100 1 100
    
        # Upload the output files
        ls -la ${LOCAL_INPUT_FILE}*_out* 1>&2
        for f in ${LOCAL_INPUT_FILE}*_out*
        do
            echo hadoop fs -put $f ${RESULT_BUCKET}/$LOCAL_INPUT_FILE/$f 1>&2
            hadoop fs -put $f ${RESULT_BUCKET}/$LOCAL_INPUT_FILE/$f 1>&2
        done
        echo processed ${S3_INPUT_FILE} 1>&2
        echo 1>&2
        echo 1>&2
    done
    
    # Tell our background keepalive task to exit
    touch ./timetostop
    
    exit 0
    
  2. Upload the mapper script to S3 via the AWS console or s3curl
    Code Block
    /work/platform/bin/s3curl.pl --id $USER --put phaseMapper.sh https://s3.amazonaws.com/sagetest-$USER/scripts/phaseMapper.sh
    
  3. Upload the phase binary to S3 as well (a quick way to verify both uploads is sketched after this list)
    Code Block
    /work/platform/bin/s3curl.pl --id $USER --put PHASE https://s3.amazonaws.com/sagetest-$USER/scripts/phase
    
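The mapper loops over S3 file paths read from stdin, one per line, so the phaseInput.txt file referenced in the job configuration below is just a plain-text list of the PHASE input files you have staged in S3. A minimal sketch follows; the file names and their location under input/ are placeholders for your own data.
    Code Block
    ~>cat phaseInput.txt
    s3n://sagetest-YourUsername/input/region1.inp
    s3n://sagetest-YourUsername/input/region2.inp
    s3n://sagetest-YourUsername/input/region3.inp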

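To double-check that both uploads landed where the job flow expects them, you can list the bucket with the same s3curl credentials used for the uploads above (a plain GET, i.e. no --put flag); the grep is only a convenience for picking out the key names.
    Code Block
    /work/platform/bin/s3curl.pl --id $USER https://s3.amazonaws.com/sagetest-$USER/ | grep -o '<Key>[^<]*</Key>'
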
...

  1. Write your job configuration. Note that you must change the -output location each time you run this, because Hadoop will fail the step if the output directory already exists!
    Code Block
    ~>cat phase.json
    [
        {
            "Name": "MapReduce Step 1: Run Phase",
            "ActionOnFailure": "CANCEL_AND_WAIT",
            "HadoopJarStep": {
                "Jar": "/home/hadoop/contrib/streaming/hadoop-streaming.jar",
                "Args": [
                    "-input",     "s3n://sagetest-YourUsername/input/phaseInput.txt",
                    "-output",    "s3n://sagetest-YourUsername/output/phaseTry1",
                    "-mapper",    "s3n://sagetest-YourUsername/scripts/phaseMapper.sh",
                    "-cacheFile", "s3n://sagetest-YourUsername/scripts/phase#phase",
                    "-jobconf",   "mapred.map.tasks=26",
                    "-jobconf",   "mapred.reduce.tasks=0",
                    "-jobconf",   "mapred.tasktracker.map.tasks.maximum=2",
                    "-jobconf",   "mapred.task.timeout=604800000"
                ]
            }
        }
    ]
    
  2. Put it on one of the shared servers (sodo, ballard, or belltown); a sketch of launching the step from there follows this list.
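
Assuming the Ruby elastic-mapreduce command-line client is installed and configured with your AWS credentials on those hosts (this walkthrough does not set that up), one way to start a job flow that runs this step is sketched below; the job-flow name, instance type, and instance count are placeholders.
    Code Block
    ~>elastic-mapreduce --create --name "Phase try 1" \
          --num-instances 14 --instance-type m1.small \
          --json phase.json
With mapred.tasktracker.map.tasks.maximum=2, a cluster of 13 workers (plus the master) can run all 26 map tasks at once; size the cluster to taste. When the step finishes, the per-input-file results appear under s3://sagetest-YourUsername/results/<inputFileName>/, which is where phaseMapper.sh uploads them.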

...