...
- Write the mapper script
Code Block ~>cat phaseMapper.sh
#!/bin/sh
#
# phaseMapper.sh - Hadoop streaming mapper that runs the phase binary on
# input files stored in S3.
#
# Input (stdin): one S3 path per line.
# For each path the script
#   1. downloads the file into the current working directory,
#   2. runs ./phase on it (the binary must already be in the working
#      directory),
#   3. uploads every file matching <name>*_out* to
#      ${RESULT_BUCKET}/<name>/.
#
# All diagnostics go to stderr; stdout is left to ./phase and the keepalive.
# The script always exits 0, as the original did; failures are only logged.

RESULT_BUCKET=s3://sagetest-YourUsername/results

# A sentinel left over from an earlier run would stop the keepalive at once.
rm -f ./timetostop

# Send some bogus output so that mapreduce does not time out during phase
# processing, since the phase algorithm does NOT send output to stdout on a
# regular basis. "$| = 1" enables autoflush: without it perl block-buffers
# STDOUT when it is a pipe and the keepalive lines are not delivered until
# perl exits.
perl -e '$| = 1; while (! -e "./timetostop") { print "keepalive\n"; print STDERR "reporter:status:keepalive\n"; sleep 300; }' &
KEEPALIVE_PID=$!

# Stop the keepalive on every exit path. The sentinel file alone is only
# noticed after the current 300 s sleep, during which the background perl
# keeps the task's stdout open, so it is also killed directly.
cleanup() {
  touch ./timetostop
  kill "$KEEPALIVE_PID" 2>/dev/null
}
trap cleanup EXIT
# Turn fatal signals into a normal exit so the EXIT trap runs in every sh.
trap 'exit 1' HUP INT TERM

# "|| [ -n ... ]" also handles a final line without a trailing newline.
while read -r S3_INPUT_FILE || [ -n "$S3_INPUT_FILE" ]; do
  # Ignore blank lines.
  [ -n "$S3_INPUT_FILE" ] || continue

  echo "input to process ${S3_INPUT_FILE}" 1>&2

  # For debugging purposes, print out the files cached for us
  ls -la 1>&2

  # Parse the s3 file path to get the file name: keep what follows the last
  # "/" and drop any "?query" or "#fragment" suffix.
  LOCAL_INPUT_FILE=${S3_INPUT_FILE##*/}
  LOCAL_INPUT_FILE=${LOCAL_INPUT_FILE%%[#?]*}
  if [ -z "$LOCAL_INPUT_FILE" ]; then
    echo "cannot derive a file name from ${S3_INPUT_FILE}, skipping" 1>&2
    continue
  fi

  # Download the file from S3
  echo "hadoop fs -get ${S3_INPUT_FILE} ${LOCAL_INPUT_FILE}" 1>&2
  if ! hadoop fs -get "$S3_INPUT_FILE" "$LOCAL_INPUT_FILE" 1>&2; then
    # Running phase without its input is pointless; move on to the next path.
    echo "download of ${S3_INPUT_FILE} failed, skipping" 1>&2
    continue
  fi

  # Run phase processing
  if ! ./phase "$LOCAL_INPUT_FILE" "${LOCAL_INPUT_FILE}_out" 100 1 100; then
    # Still upload whatever output exists so partial results can be inspected.
    echo "phase failed for ${LOCAL_INPUT_FILE}" 1>&2
  fi

  # Upload the output files
  ls -la "${LOCAL_INPUT_FILE}"*_out* 1>&2
  for f in "${LOCAL_INPUT_FILE}"*_out*; do
    # With no match the glob stays literal; do not try to upload it.
    [ -e "$f" ] || continue
    echo "hadoop fs -put $f ${RESULT_BUCKET}/$LOCAL_INPUT_FILE/$f" 1>&2
    if ! hadoop fs -put "$f" "${RESULT_BUCKET}/${LOCAL_INPUT_FILE}/$f" 1>&2; then
      echo "upload of $f failed" 1>&2
    fi
  done

  echo "processed ${S3_INPUT_FILE}" 1>&2
  echo 1>&2
  echo 1>&2
done

# The EXIT trap tells our background keepalive task to exit.
exit 0
- Upload the mapper script to S3 via the AWS console or s3curl
Code Block /work/platform/bin/s3curl.pl --id $USER --put phaseMapper.sh https://s3.amazonaws.com/sagetest-$USER/scripts/phaseMapper.sh
- Upload the phase binary to S3 too
Code Block /work/platform/bin/s3curl.pl --id $USER --put PHASE https://s3.amazonaws.com/sagetest-$USER/scripts/phase
...
- Write your job configuration. Note that you need to change the output location each time you run this!
Code Block ~>cat phase.json [ { "Name": "MapReduce Step 1: Run Phase", "ActionOnFailure": "CANCEL_AND_WAIT", "HadoopJarStep": { "Jar": "/home/hadoop/contrib/streaming/hadoop-streaming.jar", "Args": [ "-input", "s3n://sagetest-YourUsername/input/phaseInput.txt", "-output", "s3n://sagetest-YourUsername/output/phaseTry1", "-mapper", "s3n://sagetest-YourUsername/scripts/phaseMapper.sh", "-cacheFile", "s3n://sagetest-YourUsername/scripts/phase#phase", "-jobconf", "mapred.map.tasks=26", "-jobconf", "mapred.reduce.tasks=0", "-jobconf", "mapred.tasktracker.map.tasks.maximum=2", "-jobconf", "mapred.task.timeout=604800000" ] } } ]
- Put phase.json on one of the shared servers (sodo, ballard, or belltown).
...