Market Research For Ads Target


# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
database = '/kaggle/input/project-files/data.sqlite'
conn = sqlite3.connect(database)
tables = pd.read_sql("""SELECT *
FROM sqlite_master
WHERE type='table'
AND tbl_name IN (
'posts',
'comments',
'users',
'votes'
);""", conn)
tables
comments = pd.read_sql("""SELECT * FROM comments;""", conn)
comments.info()
comments.isna().any().sum()
# Convert date columns to datetime data type
comments['CreationDate'] = pd.to_datetime(comments['CreationDate'])
# Remove any leading or trailing white spaces from text columns
comments['Text'] = comments['Text'].str.strip()
comments.info()
posts = pd.read_sql("""SELECT * FROM posts;""", conn)
posts.info()
posts.isna().any().sum()
# Convert date columns to datetime data type
posts['CreationDate'] = pd.to_datetime(posts['CreationDate'])
posts['LastEditDate'] = pd.to_datetime(posts['LastEditDate'])
posts['LastActivityDate'] = pd.to_datetime(posts['LastActivityDate'])
posts['ClosedDate'] = pd.to_datetime(posts['ClosedDate'])
posts.head()
users = pd.read_sql("""SELECT
Id,
Reputation,
CreationDate,
DisplayName,
LastAccessDate,
WebsiteUrl,
Location,
Age,
Views,
UpVotes,
DownVotes
FROM users;""", conn)
users.info()
users.isna().any().sum()
# Convert date columns to datetime data type
users['CreationDate'] = pd.to_datetime(users['CreationDate'])
# Remove any leading or trailing white spaces from text columns
users['WebsiteUrl'] = users['WebsiteUrl'].str.strip()
users['Location'] = users['Location'].str.strip()
users.info()
votes = pd.read_sql("""SELECT * FROM votes;""", conn)
votes.info()
votes.isna().any().sum()
# Convert date columns to datetime data type
votes['CreationDate'] = pd.to_datetime(votes['CreationDate'])
votes.info()
posts_by_year = pd.read_sql("""SELECT
COUNT(*) AS posts_count,
STRFTIME('%Y', CreationDate) AS year
FROM posts
GROUP BY STRFTIME('%Y', CreationDate);""", conn)
posts_by_year
votes_by_year = pd.read_sql("""SELECT
COUNT(*) AS votes_count,
STRFTIME('%w', CreationDate) AS day
FROM votes
GROUP BY STRFTIME('%w', CreationDate);""", conn)
votes_by_year
votes_by_year = pd.read_sql("""SELECT
COUNT(*) AS comments_count,
STRFTIME('%w', CreationDate) AS day
FROM comments
GROUP BY STRFTIME('%w', CreationDate);""", conn)
votes_by_year
Group the data by the location of the post and count number of votes per post for each voter’s location.
voting_df = pd.read_sql("""
SELECT
p.id AS PostId,
p.AuthorID,
p.AuthorLocation,
v.id AS VoterId,
NULLIF(v.location, '') VoterLocation,
COALESCE(v.Votes, 0) Votes
FROM (
SELECT
p.id,
u.id AS AuthorID,
u.location AS AuthorLocation
FROM posts p
INNER JOIN users u
ON p.OwnerUserId = u.id
) p
LEFT JOIN (
SELECT
v.PostId,
u.id,
u.location,
COUNT(*) Votes
FROM votes v
INNER JOIN users u
ON v.UserId = u.id
GROUP BY u.location
) v
ON p.id = v.PostId
;
""", conn)
voting_df
Group the data by the location of the post and count number of comments per post for each commenters’s location.
commenting_df = pd.read_sql("""
SELECT
p.id AS PostId,
p.AuthorID,
p.AuthorLocation,
c.id AS CommenterId,
NULLIF(c.location, '') CommenterLocation,
COALESCE(c.Comments, 0) Comments
FROM (
SELECT
p.id,
u.id AS AuthorID,
u.location AS AuthorLocation
FROM posts p
INNER JOIN users u
ON p.OwnerUserId = u.id
) p
LEFT JOIN (
SELECT
c.PostId,
u.id,
u.location,
COUNT(*) Comments
FROM comments c
INNER JOIN users u
ON c.UserId = u.id
GROUP BY u.location
) c
ON p.id = c.PostId
;
""", conn)
commenting_df
country_dictionary = pd.read_json('/kaggle/input/project-files/country_dictionary.json', typ='series')
country_dictionary
# We also map user location from users
users['Country'] = users.loc[:,'Location'].map(country_dictionary)
users.head()
commenting_df['AuthorCountry'] = commenting_df.loc[:, 'AuthorLocation'].map(country_dictionary)
commenting_df['CommenterCountry'] = commenting_df.loc[:, 'CommenterLocation'].map(country_dictionary)
commenting_df.head()
voting_df['AuthorCountry'] = voting_df.loc[:, 'AuthorLocation'].map(country_dictionary)
voting_df['VoterCountry'] = voting_df.loc[:, 'VoterLocation'].map(country_dictionary)
voting_df.head()
post_tags_df = posts[['Id','Tags']]
post_tags_df.index = post_tags_df['Id']
post_tags_df
tags = pd.DataFrame(post_tags_df[post_tags_df['Tags'] != '' ]['Tags'].str.extractall('\<(.*?)\>') )
tags['PostId'] = tags.index
tags['PostId'] = np.select([True],[tags['PostId'].str[0]],default =tags['PostId'].str[0])
tags.rename(columns={0:'Tag'}, inplace=True)
tags.reset_index(drop=True, inplace=True)
Group the data by the location of the post’s tag and count number of of tag occurance per voter’s location.
voting_tags = pd.merge(voting_df, tags,how='inner', left_on='PostId', right_on='PostId')
voting_tags = voting_tags[['PostId', 'AuthorCountry', 'VoterCountry', 'Tag']]
voting_tags.head(10)
Group the data by the location of the post’s tag and count number of of tag occurance per commenters’s location.
commenting_tags = pd.merge(commenting_df, tags,how='inner', left_on='PostId', right_on='PostId')
commenting_tags = commenting_tags[['PostId', 'AuthorCountry', 'CommenterCountry', 'Tag']]
commenting_tags.head(10)
with pd.ExcelWriter('dataframes.xlsx') as writer:
# Write each DataFrame to a separate sheet in the Excel file
posts.to_excel(writer, sheet_name='Posts', index=False)
votes.to_excel(writer, sheet_name='Votes', index=False)
users.to_excel(writer, sheet_name='Users', index=False)
comments.to_excel(writer, sheet_name='Comments', index=False)
voting_df.to_excel(writer, sheet_name='Voting', index=False)
voting_tags.to_excel(writer, sheet_name='Voting Tags', index=False)
commenting_df.to_excel(writer, sheet_name='Commenting', index=False)
commenting_tags.to_excel(writer, sheet_name='Commenting Tags', index=False)
check_cleaned_df = pd.read_excel('/kaggle/working/dataframes.xlsx')
check_cleaned_df.info()