In this example, we are not using MapReduce to its full potential. We are only using it to run jobs in parallel, one job for each chromosome.
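Each map task receives one line of input naming the S3 file for its chromosome, so the streaming input is simply a list of S3 paths, one per line. A hypothetical listing is shown below; the bucket layout and file names are illustrative only, not the actual ones used in this example.

s3n://sagetest-YourUsername/input/chr1.inp
s3n://sagetest-YourUsername/input/chr2.inp
...
s3n://sagetest-YourUsername/input/chr22.inp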
Mapper
>cat phaseMapper.sh
#!/bin/sh

RESULT_BUCKET=s3://sagetest-YourUsername/results

while read S3_INPUT_FILE; do
    echo input to process ${S3_INPUT_FILE} 1>&2

    # For debugging purposes, print out the files cached for us
    ls -la 1>&2

    # Parse the s3 file path to get the file name
    LOCAL_INPUT_FILE=$(echo ${S3_INPUT_FILE} | perl -pe 'if (/^((s3[n]?):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$/) {print "$6\n"};' | head -1)

    # Download the file from S3
    echo hadoop fs -get ${S3_INPUT_FILE} ${LOCAL_INPUT_FILE} 1>&2
    hadoop fs -get ${S3_INPUT_FILE} ${LOCAL_INPUT_FILE} 1>&2

    # Run phase processing
    ./phase ${LOCAL_INPUT_FILE} ${LOCAL_INPUT_FILE}_out 100 1 100

    # Upload the output files
    ls -la ${LOCAL_INPUT_FILE}*_out 1>&2
    for f in ${LOCAL_INPUT_FILE}*_out
    do
        echo hadoop fs -put $f ${RESULT_BUCKET}/$LOCAL_INPUT_FILE/$f 1>&2
        hadoop fs -put $f ${RESULT_BUCKET}/$LOCAL_INPUT_FILE/$f 1>&2
    done

    echo processed ${S3_INPUT_FILE} 1>&2
    echo 1>&2
    echo 1>&2
done
exit 0
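If you want to sanity-check the mapper before submitting a job, you can exercise it locally by piping a single path into it, since Hadoop Streaming feeds each input line to the mapper's stdin in the same way. The path below is purely illustrative, and this assumes the hadoop client is configured on your machine and the phase binary sits in the working directory alongside the script.

# Hypothetical local smoke test: feed the mapper one S3 path and watch its stderr
chmod +x phaseMapper.sh phase
echo "s3n://sagetest-YourUsername/input/chr21.inp" | ./phaseMapper.sh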
Upload the script to S3 via the AWS console or s3curl:
/work/platform/bin/s3curl.pl --id $USER --put phaseMapper.sh https://s3.amazonaws.com/sagetest-$USER/scripts/phaseMapper.sh
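As an optional check, the same tool can fetch the object back; if the upload succeeded, this prints the script (again assuming the sagetest-$USER bucket naming used above).

# Optional check: GET the uploaded script back from S3 to confirm the upload
/work/platform/bin/s3curl.pl --id $USER https://s3.amazonaws.com/sagetest-$USER/scripts/phaseMapper.sh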