Content Comparison

...

Raw data is the data we receive directly from the source, in our case the Synapse repository project. Synapse sends access records to S3 through a Kinesis Firehose delivery stream.

For each access record we need to modify the requesturl field and calculate new fields such as client, client_version, etc. from the existing data, according to business logic. Processing the data means taking the raw data, modifying the requesturl field of each record, and calculating the new field data.

Now we want to keep the processed data in a Glue table that has a separate S3 storage location, partitioned by the timestamp of the access record as year, month and day, and we want to run Athena queries on it. The data should be stored in Parquet format; Parquet is a column-based file format that is fast to read.

The ETL job script would look like the following:

Code Block

language	json

"""
This script executed by a Glue job. The job take the access record data from S3 and process it.
 Processed data stored in S3 in a parquet file partitioned by timestamp of record as  year / month / day.
"""

import sys
import datetime
import re
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Script generated for node S3 bucket
S3bucket_node1 = glueContextfrom awsglue.dynamicframe import DynamicFrame

# Get access record from source and create dynamic frame for futher processing
def get_dynamic_frame(connection_type, file_format, source_path, glue_context):
    dynamic_frame = glue_context.create_dynamic_frame.from_options(
        format_options={"multiline": True},
    connection_type="s3",     format="json",
    connection_options={
        "paths": ["s3://dev.log.sagebase.org/accessRecord/"]connection_type=connection_type,
        "recurse": True,
    },
    transformation_ctx="S3bucket_node1",
)

# Script generated for node ApplyMapping
ApplyMapping_node2 = ApplyMapping.apply(
    frame=S3bucket_node1,
    mappings=[format=file_format,
        connection_options={
            ("payload.datepaths", "string", "date", "string"),: [source_path],
          ("payload.requestURL", "string", "requesturlrecurse", "string"),
    : True,
   ("payload.xforwardedFor", "string", "xforwardedfor", "string"),         ("payload.oauthClientIdgroupFiles",: "nullinPartition",
"oauthclientid", "string"),         ("payload.success", "boolean", "successgroupSize",: "boolean1048576"),
        ("payload.sessionId"},
"string", "sessionid", "string"),         ("payload.queryString", "null", "querystring", "string"),
        ("payload.method", "string", "method", "string"),transformation_ctx="dynamic_frame")
    return dynamic_frame


def apply_mapping(dynamic_frame):
    mapped_dynamic_frame =   ("payload.basicAuthUsername", "null", "basicauthusername", "string"),ApplyMapping.apply(
           ("payload.instance", "string", "instance", "string"),frame=dynamic_frame,
         ("payload.stack", "string", "stack", "string"),mappings=[
            ("payload.hostsessionId", "string", "hostSESSION_ID", "string"),
        ("payload.elapseMS", "int", "elapsems", "int"),
 .....
       ("payload.threadId", "int" ],
"threadid", "int"),         ("payload.userId", "int", "userid", "int"),
        ("payload.authenticationMethod", "null", "authenticationmethod", "string"),transformation_ctx="mapped_dynamic_frame")
    return mapped_dynamic_frame


# process the access record
def transform(dynamic_record):
    #transformation
   ("payload.returnObjectId", "null", "returnobjectid", "string"), return dynamic_record


def      ("payload.origin", "null", "origin", "string"),main():
    # Get args  ("payload.via", "null", "via", "string"),and setup environment
    args =    ("payload.vmId", "string", "vmid", "string"),getResolvedOptions(sys.argv,
           ("payload.userAgent", "string", "useragent", "string"),         ("payload.responseStatus", "int", "responsestatus", "int"),         ("timestamp["JOB_NAME", "bigintS3_SOURCE_PATH", "timestampDATABASE_NAME", "bigintTABLE_NAME"]),
    ],
    transformation_ctx="ApplyMapping_node2",
)

def transform(dynamicRecord):sc = SparkContext()
    tempVarglue_context = dynamicRecord["requestURL"].lower(GlueContext(sc)
    if tempVar.startswith("/entity/md5"):spark = glue_context.spark_session
    job    tempVar = "/entity/md5/#"
    elif tempVar.startswith("/evaluation/name"):= Job(glue_context)
        tempVar = "/evaluation/name/#"
    elif tempVar.startswith("/entity/alias"):job.init(args["JOB_NAME"], args)

       tempVardynamic_frame = "/entity/alias/#"
    else:
        tempVar = "/#"
        
    dynamicRecord["requestURL"] = tempVar
    print("Schema for mapped_medicare DynamicFrame:" + tempVarget_dynamic_frame("s3", "json", args["S3_SOURCE_PATH"], glue_context)
    mapped_dynamic_frame = apply_mapping(dynamic_frame)
    return dynamicRecord
    
mapped_record = ApplyMapping_node2transformed_dynamic_frame = mapped_dynamic_frame.map(f = transform)
mapped_record.printSchema()    
    # Script generated for node S3 bucket
S3bucket_node3 = glueContext  Write the processed access records to destination
    write_dynamic_frame = glue_context.write_dynamic_frame.from_optionscatalog(
        frame=mappedtransformed_dynamic_recordframe,
    connection_type="s3"    database=args["DATABASE_NAME"],
    format="praquet",     connectiontable_options={name=args["TABLE_NAME"],
        additional_options={"pathpartitionKeys": "s3://dev.log.sagebase.org/processedAccessRecord/["year",         "compression": "gzip""month", "day"]},
        "partitionKeys": [],transformation_ctx="write_dynamic_frame")

    },job.commit()


if  transformation_ctx="S3bucket_node3",
__name__ == "__main__":
    main()

Challenges

As we deploy a new stack every week, the ETL job should not reprocess old or already processed data.

...

4. How to handle duplicate data.

Proposed solution :

In every stack, Kinesis Firehose should send data to an S3 path that contains the release info, e.g. dev.log.sagebase.org/accessRecord/releaseNumber/year/month/day. The ETL job should then use dev.log.sagebase.org/accessRecord/releaseNumber/ as its source, so that it processes only the data of that release. This way we can avoid reprocessing data. The processed data should be stored in a general path without the release number, e.g. dev.log.sagebase.org/processedAccessRecord/year/month/day.
Open for discussion.
Open for discussion. Suggestion: as we currently use the processed access records for audit purposes, we can schedule our job to run every hour. The ETL job will be static, not created in every release.
Create a GitHub project (Synapse-ETL-Jobs) for our ETL job. Add a GitHub action that creates a tag/release, which zips the source code and versions it on every merge into the develop branch; basically, use GitHub as the artifactory. In the Synapse-Stack-Builder project, create a new workflow that first downloads the Python script from the latest tag of Synapse-ETL-Jobs and uploads it to S3, from where the Glue job can take it for processing. Then create the stack with a CloudFormation template that sets up a Glue job, Glue table, trigger and the required resources. The build should include testing.
It should be a configurable parameter in the stack builder, and the job should be triggered every hour. (In the future we can find another way to be notified that new data is available for processing.)
One way to avoid duplicate data is to use Glue tables as both the source and the destination of the job, and then use a left join to identify the duplicates, as shown below. (For now we are not handling duplicates; we can look into it in the future if it becomes a problem.)

Code Block

# Illustrative de-duplication flow: load the staging (source) and target
# catalog tables, keep only the staging rows that have no match in the
# target, and write those rows to the target table.
# NOTE(review): `gc`, `args` and `col` are not defined in this fragment;
# presumably `gc` is a GlueContext and `col` is pyspark.sql.functions.col --
# confirm against the enclosing script.
stagingdatasource = gc.create_dynamic_frame.from_catalog(
        database="stagingdatabase",
        table_name="staging_source_table",
        transformation_ctx="stagingdatasource")

    # Load the current contents of the target table for comparison.
    targetdatasource = gc.create_dynamic_frame.from_catalog(
        database="targetdatabase",
        redshift_tmp_dir=args["TempDir"],
        table_name="target_table",
        transformation_ctx="targetdatasource")

    # Map the staging columns `description` and `id` through unchanged.
    columnmapping = ApplyMapping.apply(
        frame=stagingdatasource,
        mappings=[("description", "string", "description", "string"), ("id", "int", "id", "int")],
        transformation_ctx="columnmapping")

    # Convert both frames with toDF() and alias them so the join and filter
    # below can refer to each side as 'ta' (staging) and 'tb' (target).
    ta = columnmapping.toDF().alias('ta')
    tb = targetdatasource.toDF().alias('tb')

    # Left join staging to target and keep only the rows where the target
    # side is null, i.e. staging rows with no matching row in the target.
    # NOTE(review): the join key is `value`, but the mapping above only lists
    # `description` and `id` -- confirm which column should be the join key.
    left_join = ta\
        .join(tb, ta.value == tb.value, how='left')\
        .filter(col('tb.value').isNull())\
        .select('ta.*')

    # Inspect left join
    # left_join.show()

    # Wrap the joined result back into a DynamicFrame for the Glue writer.
    finaldf = DynamicFrame.fromDF(left_join, gc, "nested")

    # Write only the unmatched staging rows to the target catalog table.
    gc.write_dynamic_frame.from_catalog(
        frame=finaldf,
        database="targetdatabase",
        redshift_tmp_dir=args["TempDir"],
        table_name="target_table")

...

While sending access records from the Synapse repository to Kinesis Firehose, we can store both the request URL and the processed URL, or only the processed URL. Kinesis should then use dynamic partitioning on the timestamp of the access record to store the data on S3. The stored data is then already processed: create a table from the source S3 location and Athena can query the data. (This is not adequate, because we might want to process the data further; we need the raw data in JSON format so that it is available in a readable form, as well as the processed data in Parquet format so that it can be queried faster.)

Version	Old Version 6	New Version Current
Changes made by	Sandhra Sokhal	Sandhra Sokhal
Saved on	Jan 09, 2023	Mar 22, 2023

Content Comparison

Versions Compared

Key

Challenges

Proposed solution :