Copyright © Code Fetcher 2022
In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, regexp_tokenize
import string # for punctuation
import ipynb_utils as ipyutils # custom variables and utility functions
In [2]:
# Read the scraped posts; the JSON file is keyed by row label (orient='index').
data_path = '../data/scrapes.json'
post_df = pd.read_json(path_or_buf=data_path, orient='index')
post_df.shape  # (rows, columns)
Out[2]:
(8718, 6)
In [3]:
# Preview the first five rows of the freshly loaded scrape.
post_df.head()
Out[3]:
uid | time | title | body-text | media | comments | |
---|---|---|---|---|---|---|
0 | 45Newbiequestionsaboutascendantsandborders601 | 2022-09-05 | Newbie questions about ascendants and borders | I’m new to actually learning astrology, not ju… | 0 | |
1 | 100Thousandsofunchartedplanetsatyourfingertips… | Thousands of uncharted planets at your fingert… | True | 0 | ||
2 | 34Astrologyandcognitivedissonance323 | 2022-09-05 | Astrology and cognitive dissonance | Open to anyone who wouldn’t mind sharing a rec… | 1 | |
3 | 38whatdoy’allthinkofpersonacharts?180 | 2022-09-05 | what do y’all think of persona charts? | I feel a bit skeptical of them, since I feel l… | 0 | |
4 | 160RESOURCEREQUEST:Videos(orarticles)withtips/… | 2022-09-05 | RESOURCE REQUEST: Videos (or articles) with ti… | I think my problem is that I don’t know the pr… | 2 |
In [4]:
# Rows whose 'time' is an empty string -- are these all ads?
post_df.loc[post_df['time'].eq('')].head()
Out[4]:
uid | time | title | body-text | media | comments | |
---|---|---|---|---|---|---|
1 | 100Thousandsofunchartedplanetsatyourfingertips… | Thousands of uncharted planets at your fingert… | True | 0 | ||
9 | 136GetthefastestFiberInternetinthetri-statefro… | Get the fastest Fiber Internet in the tri-stat… | True | 0 | ||
17 | 214Thebestplayersinthesoccerworldcometogetherf… | The best players in the soccer world come toge… | True | 0 | ||
25 | 254Yougottabereadyforanythingifyouwanttokeepup… | You gotta be ready for anything if you want to… | True | 0 | ||
40 | 105Getfiredupforthebestseasonyetwithnewsoccerc… | Get fired up for the best season yet with new … | True | 0 |
In [5]:
# Boolean mask marking the rows with an empty 'time' (suspected ads);
# kept in a variable so later cells can reuse it.
adfilter = post_df['time'].eq('')
In [6]:
# Look at the titles of the suspected-ad rows in more detail.
post_df.loc[adfilter, 'title']
Out[6]:
1 Thousands of uncharted planets at your fingert... 9 Get the fastest Fiber Internet in the tri-stat... 17 The best players in the soccer world come toge... 25 You gotta be ready for anything if you want to... 40 Get fired up for the best season yet with new ... ... 2816 Fan of sports?⚽ ⚾ Enjoy the sports betting ex... 2817 The only thing more nerve-racking than proposi... 2818 Explore new ways to play with hundreds of game... 2819 Stem cell therapy is safe over the long term a... 2820 I'm developing an open-world space ARPG, comin... Name: title, Length: 305, dtype: object
In [7]:
# Conclusion: all the posts with no timestamp are ads.
# Even if a few of them are not, I'm willing to forgo this smallish subset
# of entries that mostly sound suspiciously like ads.
ad_rows = post_df.loc[adfilter]  # the rows flagged by the empty-'time' mask
post_df.drop(index=ad_rows.index, inplace=True)
post_df.shape
Out[7]:
(8413, 6)
In [8]:
# Recode the media column as a 0/1 integer flag: 1 where the value is the
# string 'True', 0 for everything else.
# NOTE(review): if read_json parsed this column as real booleans rather than
# strings, the comparison with 'True' would give 0 for every row -- confirm
# against the raw data.
is_media = post_df['media'].eq('True')
post_df['media'] = is_media.astype(int)
post_df.dtypes
Out[8]:
uid object time object title object body-text object media int64 comments int64 dtype: object
Columns are about as clean as can be for now.
In [9]:
# Preview the frame again now that the untimestamped rows are dropped and
# 'media' has been recoded.
post_df.head()
Out[9]:
uid | time | title | body-text | media | comments | |
---|---|---|---|---|---|---|
0 | 45Newbiequestionsaboutascendantsandborders601 | 2022-09-05 | Newbie questions about ascendants and borders | I’m new to actually learning astrology, not ju… | 0 | 0 |
2 | 34Astrologyandcognitivedissonance323 | 2022-09-05 | Astrology and cognitive dissonance | Open to anyone who wouldn’t mind sharing a rec… | 0 | 1 |
3 | 38whatdoy’allthinkofpersonacharts?180 | 2022-09-05 | what do y’all think of persona charts? | I feel a bit skeptical of them, since I feel l… | 0 | 0 |
4 | 160RESOURCEREQUEST:Videos(orarticles)withtips/… | 2022-09-05 | RESOURCE REQUEST: Videos (or articles) with ti… | I think my problem is that I don’t know the pr… | 0 | 2 |
5 | 64peoplewhohavehadsaturntransittheir10th,whatw… | 2022-09-05 | people who have had saturn transit their 10th,… | How did it affect your career? Did it impact y… | 0 | 11 |
In [10]:
# Display the regex token pattern that the word-count cells below pass to
# regexp_tokenize; settled on using the same pattern as the CountVectorizer
# default.
ipyutils.PAT_TOKEN
Out[10]:
'(?u)\b\w\w+\b'
In [11]:
# Title word count: number of tokens matched by the shared token pattern.
post_df['title-wc'] = post_df['title'].map(
    lambda title: len(regexp_tokenize(title, ipyutils.PAT_TOKEN))
)
post_df['title-wc'].head(2)
Out[11]:
0 6 2 4 Name: title-wc, dtype: int64
In [12]:
# Title character count: length of each title string.
post_df['title-cc'] = post_df['title'].map(len)
post_df['title-cc'].head(2)
Out[12]:
0 45 2 34 Name: title-cc, dtype: int64
In [13]:
# Body word count: number of tokens matched by the shared token pattern.
post_df['body-wc'] = post_df['body-text'].map(
    lambda body: len(regexp_tokenize(body, ipyutils.PAT_TOKEN))
)
post_df['body-wc'].head()
Out[13]:
0 107 2 49 3 29 4 94 5 22 Name: body-wc, dtype: int64
In [14]:
# Body character count: length of each body-text string.
post_df['body-cc'] = post_df['body-text'].map(len)
post_df['body-cc'].head()
Out[14]:
0 601 2 323 3 180 4 597 5 116 Name: body-cc, dtype: int64
In [15]:
# Put the columns in their final order. 'uid' is left out on purpose: it was
# only needed to find duplicates.
column_order = [
    'time', 'title', 'body-text',
    'title-cc', 'title-wc', 'body-cc', 'body-wc',
    'media', 'comments',
]
post_df = post_df.loc[:, column_order]
post_df.columns
Out[15]:
Index(['time', 'title', 'body-text', 'title-cc', 'title-wc', 'body-cc', 'body-wc', 'media', 'comments'], dtype='object')
In [16]:
# Renumber the rows 0..n-1 before saving. reset_index() returns a new frame
# instead of modifying post_df, so the result has to be assigned back;
# drop=True discards the old (gappy, post-ad-removal) labels rather than
# keeping them as an extra 'index' column.
post_df = post_df.reset_index(drop=True)

# Write the cleaned posts keyed by row label (orient='index'), mirroring the
# layout the raw file was read with.
post_df.to_json('../data/scrapes-clean.json', orient='index')
In [17]:
# END