import random
import datetime
# The states of the shopper Markov chain. 'sale' has no outgoing transitions
# below, so it acts as the terminal state of a session.
STATES = ['entry', 'product_view', 'add_to_cart', 'check_in', 'sale']

# Define the transition probabilities between states.
# Outer key: current state; inner mapping: next state -> probability.
TRANSITION_PROBABILITIES = {
    'entry': {'product_view': 0.5, 'check_in': 0.3, 'add_to_cart': 0.2},
    'product_view': {'add_to_cart': 0.3, 'check_in': 0.5, 'entry': 0.2},
    'add_to_cart': {'check_in': 0.4, 'product_view': 0.3, 'sale': 0.3},
    'check_in': {'product_view': 0.4, 'add_to_cart': 0.3, 'sale': 0.3},
    'sale': {},
}
def generate_events(user_count, start_date, *, transitions=None, rng=None):
    """Lazily simulate e-commerce click-stream events, one session per user.

    Each user gets a single session that starts in the ``'entry'`` state at
    ``start_date`` and walks the Markov chain until a ``'sale'`` is reached
    (or a state without outgoing transitions is hit). The initial ``'entry'``
    state itself is not emitted; only the states that are transitioned *into*
    are yielded.

    Args:
        user_count: Number of users to simulate; user ids run from 1 to
            ``user_count`` inclusive. Zero or a negative value yields nothing.
        start_date: ``datetime.datetime`` at which every session starts.
        transitions: Optional mapping ``state -> {next_state: weight}``.
            Defaults to the module-level ``TRANSITION_PROBABILITIES``.
        rng: Optional random source exposing ``randint``, ``choices`` and
            ``uniform`` (e.g. ``random.Random(seed)``) for reproducible runs.
            Defaults to the shared ``random`` module.

    Yields:
        dict: An event with the keys ``user_id``, ``session_id``, ``state``,
        ``time_stamp``, ``product`` and ``purchase_price``. ``product`` is
        ``None`` unless the state is product related; ``purchase_price`` is
        ``None`` unless the state is ``'sale'``.

    Raises:
        KeyError: If the chain reaches a state missing from ``transitions``.
    """
    if transitions is None:
        transitions = TRANSITION_PROBABILITIES
    if rng is None:
        rng = random

    for user_id in range(1, user_count + 1):
        # Generate a session ID for the user
        session_id = str(user_id) + '_' + str(rng.randint(100000, 999999))

        # Initialize the state and timestamp for the user's first event
        current_state = 'entry'
        current_time = start_date

        while True:
            outgoing = transitions[current_state]
            if not outgoing:
                # Dead-end state: sampling from an empty population would
                # raise, so the session simply ends here.
                break

            # Generate the next state based on the current state and
            # transition probabilities
            next_state = rng.choices(list(outgoing.keys()),
                                     list(outgoing.values()))[0]

            # Generate a timestamp for the next event (1 to 300 seconds later)
            time_diff = datetime.timedelta(seconds=rng.randint(1, 300))
            next_time = current_time + time_diff

            # Generate a product ID for product-related events. The id is
            # drawn independently for every event, so the product of a sale
            # is not tied to an earlier view or add_to_cart.
            product = None
            if next_state in ('product_view', 'add_to_cart', 'sale'):
                product = 'product_' + str(rng.randint(1, 100))

            # If the next state is a sale, generate a purchase price
            purchase_price = None
            if next_state == 'sale':
                purchase_price = round(rng.uniform(10, 100), 2)

            # Yield the event with the user ID, session ID, state, timestamp,
            # product, and purchase price
            yield {'user_id': user_id, 'session_id': session_id,
                   'state': next_state, 'time_stamp': next_time,
                   'product': product, 'purchase_price': purchase_price}

            # If the next state is a sale, stop generating events for the user
            if next_state == 'sale':
                break

            # Otherwise, update the current state and timestamp for the next
            # event
            current_state = next_state
            current_time = next_time
Goal:
Create Simulated data sets
steps:
- break down simulation into blocks
- simulate each block into a csv
- simulate an event stream
- simulate a graph
- put on s3
- put in deltalake
- process with spark
Block list
Simulated fake data is one of the most important tools in the Bayesian arsenal.
- it makes us think about the data side of the problem.
- it lets us find out about some of the constraints that should go into building the priors.
- it gives us a coherent ground truth to validate our models.
In this case I want to simulate an ecommerce store: the first two objects have to do with demand
- there are different types of seasonality [winter,summer,spring,fall,weekdays,weekend,holiday,none]
- there are different types of trend [up,down,steady,random]
- there are three relations between products [substitutes, complements, none]
- there are three relations between products and brands [inferior, superior, none]
- there is a list of brands
- there is a list of product categories.
- there is a list of products.
generate_brand(): generate a brand with fields: id, rv_quality ~ Normal(quality_mean, quality_variance), name.
product_generator(brand, category): generate a product with fields: id, brand, description, quality_score drawn from the brand’s distribution.
for each product category: while there are fewer than three products in the category: for each brand: if there is a product with the brand, continue; else if random bernoulli(p=0.5) > 0.3333, generate a product.
prompt
can you write a python generator that takes parameters `user_count` for the number of users and `start_date`, and simulates user events in an ecommerce site. The events are drawn from a Markov chain with the following states [entry, product_view, add_to_cart, check_in, sale]. Each event has a time_stamp, user_id, session_id, and product. Each event is one of these states; if the event is a sale there is a purchase price.
# Generate events for 10 users starting from January 1, 2023.
# generate_events is a generator, so nothing is simulated until iteration.
events = generate_events(user_count=10, start_date=datetime.datetime(2023, 1, 1))

# Iterate through the events and print them
for event in events:
    print(event)
{'user_id': 1, 'session_id': '1_483682', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 2, 10), 'product': None, 'purchase_price': None}
{'user_id': 1, 'session_id': '1_483682', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 2, 29), 'product': 'product_2', 'purchase_price': 35.74}
{'user_id': 2, 'session_id': '2_433637', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 1, 43), 'product': 'product_64', 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 5, 31), 'product': None, 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 8, 12), 'product': 'product_43', 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'entry', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 11), 'product': None, 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 13, 8), 'product': None, 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 14, 41), 'product': 'product_74', 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 15, 21), 'product': 'product_72', 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 17, 56), 'product': None, 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 18, 51), 'product': 'product_11', 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 22, 32), 'product': 'product_64', 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 26, 57), 'product': None, 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 29, 37), 'product': 'product_56', 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 34), 'product': None, 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 34, 30), 'product': 'product_75', 'purchase_price': None}
{'user_id': 2, 'session_id': '2_433637', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 37, 26), 'product': 'product_33', 'purchase_price': 34.99}
{'user_id': 3, 'session_id': '3_694542', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 4, 9), 'product': 'product_77', 'purchase_price': None}
{'user_id': 3, 'session_id': '3_694542', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 7, 35), 'product': 'product_79', 'purchase_price': 72.97}
{'user_id': 4, 'session_id': '4_958223', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 2, 10), 'product': None, 'purchase_price': None}
{'user_id': 4, 'session_id': '4_958223', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 3, 39), 'product': 'product_50', 'purchase_price': None}
{'user_id': 4, 'session_id': '4_958223', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 5, 14), 'product': 'product_90', 'purchase_price': 81.8}
{'user_id': 5, 'session_id': '5_886655', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 1, 28), 'product': 'product_23', 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 3, 57), 'product': None, 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 6, 26), 'product': 'product_97', 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 9, 52), 'product': None, 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 13, 55), 'product': 'product_29', 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 15, 22), 'product': None, 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 20, 8), 'product': 'product_12', 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'entry', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 21, 26), 'product': None, 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 21, 30), 'product': None, 'purchase_price': None}
{'user_id': 5, 'session_id': '5_886655', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 24, 20), 'product': 'product_82', 'purchase_price': 19.43}
{'user_id': 6, 'session_id': '6_794128', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 1, 30), 'product': 'product_22', 'purchase_price': None}
{'user_id': 6, 'session_id': '6_794128', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 3, 46), 'product': None, 'purchase_price': None}
{'user_id': 6, 'session_id': '6_794128', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 4, 59), 'product': 'product_53', 'purchase_price': None}
{'user_id': 6, 'session_id': '6_794128', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 5, 4), 'product': None, 'purchase_price': None}
{'user_id': 6, 'session_id': '6_794128', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 7, 3), 'product': 'product_65', 'purchase_price': 21.22}
{'user_id': 7, 'session_id': '7_197333', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 1, 19), 'product': 'product_84', 'purchase_price': None}
{'user_id': 7, 'session_id': '7_197333', 'state': 'entry', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 2, 31), 'product': None, 'purchase_price': None}
{'user_id': 7, 'session_id': '7_197333', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 6, 59), 'product': None, 'purchase_price': None}
{'user_id': 7, 'session_id': '7_197333', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 10, 32), 'product': 'product_67', 'purchase_price': None}
{'user_id': 7, 'session_id': '7_197333', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 13, 51), 'product': 'product_95', 'purchase_price': 35.73}
{'user_id': 8, 'session_id': '8_194982', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 0, 36), 'product': 'product_92', 'purchase_price': None}
{'user_id': 8, 'session_id': '8_194982', 'state': 'entry', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 3, 7), 'product': None, 'purchase_price': None}
{'user_id': 8, 'session_id': '8_194982', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 7, 29), 'product': 'product_46', 'purchase_price': None}
{'user_id': 8, 'session_id': '8_194982', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 12, 27), 'product': None, 'purchase_price': None}
{'user_id': 8, 'session_id': '8_194982', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 13, 58), 'product': 'product_3', 'purchase_price': 60.24}
{'user_id': 9, 'session_id': '9_145850', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 2, 4), 'product': 'product_59', 'purchase_price': None}
{'user_id': 9, 'session_id': '9_145850', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 4, 34), 'product': None, 'purchase_price': None}
{'user_id': 9, 'session_id': '9_145850', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 5, 20), 'product': 'product_65', 'purchase_price': None}
{'user_id': 9, 'session_id': '9_145850', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 7, 6), 'product': 'product_10', 'purchase_price': None}
{'user_id': 9, 'session_id': '9_145850', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 11, 33), 'product': 'product_98', 'purchase_price': 63.64}
{'user_id': 10, 'session_id': '10_419800', 'state': 'product_view', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 1, 20), 'product': 'product_63', 'purchase_price': None}
{'user_id': 10, 'session_id': '10_419800', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 1, 45), 'product': 'product_58', 'purchase_price': None}
{'user_id': 10, 'session_id': '10_419800', 'state': 'check_in', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 6, 18), 'product': None, 'purchase_price': None}
{'user_id': 10, 'session_id': '10_419800', 'state': 'add_to_cart', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 9, 45), 'product': 'product_18', 'purchase_price': None}
{'user_id': 10, 'session_id': '10_419800', 'state': 'sale', 'time_stamp': datetime.datetime(2023, 1, 1, 0, 14, 33), 'product': 'product_61', 'purchase_price': 22.93}
can you use the python faker to create a user generator and a product generator, then incorporate these into the above event_generator, adding support for units_sold
and the page_address based on the product. The products should have a category and their price should be drawn from a category-level normal distribution
Citation
@online{bochman2023,
author = {Bochman, Oren},
title = {Event Generator},
date = {2023-02-16},
url = {https://orenbochman.github.io/posts/2024/2023-03-16-events-generator/event_generator.html},
langid = {en}
}