python - clean up urls and aggregate counts

We get data like this from Google Analytics:

https://www.dropbox.com/s/ozr0iwptd0t4u11/analytics%20all%20web%20site%20data%20pages%2020190930-20191006%20%281%29.csv?dl=0

I would like to pass this CSV to Python and have it process the file, reducing the output to just URLs with IDs and then doing a thoughtful aggregation of the stats in the columns.

Look at each row and try to extract the ID from the URL: typically a 7-digit number at the end of the URL, before any query params, # fragments, etc.
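
For illustration, one way to express that rule as a standalone regex; a minimal sketch, where the exactly-7-digits assumption and the example URLs are mine, not taken from the export:

import re

# Lookarounds keep the pattern from matching 7 digits inside a
# longer digit run (e.g. an 8-digit number).
ID_PATTERN = re.compile(r"(?<!\d)(\d{7})(?!\d)")

def extract_id(url):
    match = ID_PATTERN.search(url)
    return match.group(1) if match else None

print(extract_id("/listings/1234567?utm_source=x"))  # -> 1234567
print(extract_id("/about"))                          # -> None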

then for each column we should:
Page views = SUM by ID match
Unique page views = SUM by ID match
Avg time = AVG by ID match
Entrances = SUM by ID match
Bounce = AVG by ID match
Exit % = AVG by ID match

It can output a new CSV with just the logic above; any URL without an ID in it can be ignored in the output.
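
For reference, a minimal sketch of what that aggregation means in pandas terms, using made-up numbers (the accepted solution below handles the real cleanup of commas, "%" signs, and "<00:00:01" values in the raw export):

import pandas as pd

# Toy rows standing in for an already-cleaned GA export; values are invented.
df = pd.DataFrame({
    "Page": ["/l/1234567", "/l/1234567?ref=a", "/about"],
    "Pageviews": [10, 5, 99],
    "Unique Pageviews": [8, 4, 90],
    "Avg. Time on Page": [30.0, 60.0, 12.0],
    "Entrances": [3, 2, 50],
    "Bounce Rate": [50.0, 25.0, 10.0],
    "% Exit": [20.0, 40.0, 5.0],
})

df["id"] = df["Page"].str.extract(r"(\d{7})")
df = df.dropna(subset=["id"])  # URLs without an ID are ignored

out = df.groupby("id").agg({
    "Pageviews": "sum",           # 15
    "Unique Pageviews": "sum",    # 12
    "Avg. Time on Page": "mean",  # 45.0
    "Entrances": "sum",           # 5
    "Bounce Rate": "mean",        # 37.5
    "% Exit": "mean",             # 30.0
})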

awarded to lmacken
Tags
python3


1 Solution

Winning solution

#!/usr/bin/env python
"""
https://bountify.co/python-clean-up-urls-and-aggregate-counts
"""

import sys

import pandas as pd

if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <CSV file>", file=sys.stderr)
    sys.exit(1)


filename = sys.argv[1]
print(f"Processing {filename}")
df = pd.read_csv(filename)
# GA exports use comma thousands separators; strip them. A stray
# "Pageviews" string (e.g. a repeated header row) is zeroed out before casting.
df.Pageviews = df.Pageviews.str.replace(",", "")
df.Pageviews = df.Pageviews.str.replace("Pageviews", "0").fillna("0").astype(int)
df["Unique Pageviews"] = (
    df["Unique Pageviews"].str.replace(",", "").fillna("0").astype(int)
)

# Look at each row and try to extract the ID from the URL: typically a
# 7-digit number at the end of the URL, before any query params, # etc.
df["id"] = df.Page.str.extract(r".*(\d{7}).*")

# Drop rows whose URL contains no ID
df = df.dropna(subset=["id"])

# Convert the "HH:MM:SS" duration strings to a number of seconds

def time_convert(x):
    h, m, s = map(int, x.split(":"))
    return (h * 60 + m) * 60 + s
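
# For example (hypothetical values):
#   time_convert("00:01:30") -> 90
#   time_convert("01:00:05") -> 3605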


# GA reports sub-second times as "<00:00:01"; stripping the "<"
# makes them parse as 1 second
df["Avg. Time on Page"] = df["Avg. Time on Page"].str.replace("<", "")
df["Seconds on Page"] = df["Avg. Time on Page"].apply(time_convert)


# Create our new data frame
ndf = pd.DataFrame()

# Page views = sum by ID match
ndf["Pageviews"] = df.groupby("id").Pageviews.sum()

# Unique page views = sum by ID match
ndf["Unique Pageviews"] = df.groupby("id")["Unique Pageviews"].sum()

# Avg time = AVG by ID Match
ndf["Average Seconds on Page"] = df.groupby("id")["Seconds on Page"].mean()

# Entrances = sum by ID match
df.Entrances = df.Entrances.astype(int)
ndf["Entrances"] = df.groupby("id").Entrances.sum()

# Bounce = AVG by ID Match
df["Bounce Rate Float"] = df["Bounce Rate"].str.replace("%", "").astype(float)
ndf["Bounce Average"] = df.groupby("id")["Bounce Rate Float"].mean()

# Exit % = AVG by ID Match
df["% Exit Float"] = df["% Exit"].str.replace("%", "").astype(float)
ndf["Average Exit %"] = df.groupby("id")["% Exit Float"].mean()

out_filename = filename.replace(".csv", "-PROCESSED.csv")
ndf.to_csv(out_filename)
print(f"Wrote {out_filename}")