Visualizations Showcase
IMDb Top 1000 with PlotAPI Heatmap (Data from Before the IMDb Redesign)
import requests
import lxml.html
import itertools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
# Exploratory cell: fetch the first results page and extract every field from
# one listing to check the XPath selectors before scraping all pages.
url = "https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=200&start=1&ref_=adv_nxt"
# The timeout stops a stalled connection from hanging the session, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content
html = lxml.html.fromstring(content)
items = html.xpath("//div[contains(@class, 'lister-item mode-advanced')]")
item = items[1]  # probe the second listing on the page
name = item.xpath('.//h3[@class="lister-item-header"]//a/text()')[0]
img = item.xpath('.//div[@class="lister-item-image float-left"]//a//img/@loadlate')[0]
rating = item.xpath('.//div[@class="inline-block ratings-imdb-rating"]//strong/text()')[0]
genre = item.xpath('.//span[@class="genre"]//text()')[0].strip()
runtime = item.xpath('.//span[@class="runtime"]//text()')[0].strip()
certificate = item.xpath('.//span[@class="certificate"]//text()')[0].strip()
metascore = item.xpath('.//div[@class="inline-block ratings-metascore"]//span/text()')[0].strip()
url = "https://www.imdb.com" + item.xpath('.//h3[@class="lister-item-header"]//a/@href')[0]
# Same index mapping as the full scrape further down: data-value index 0 is
# stored as `votes`, index 1 as `eww` (which ends up in the 'gross' column).
eww = item.xpath('.//p[@class="sort-num_votes-visible"]//span/@data-value')[1].strip()
votes = item.xpath('.//p[@class="sort-num_votes-visible"]//span/@data-value')[0].strip()
# Notebook display of everything extracted from the probed listing.
[name, img, rating, genre, runtime, certificate, metascore, eww, votes, url]
import requests
import lxml.html
import itertools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
# Download the five result pages (start = 1, 201, ..., 801; 200 titles each)
# and pool every listing node into `items`.
items = []
for start in range(1, 1000, 200):
    url = f"https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=200&start={start}&ref_=adv_nxt"
    # Fail fast on a stalled connection or an HTTP error rather than
    # silently parsing an error page that contains no listings.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    content = response.content
    html = lxml.html.fromstring(content)
    items.extend(html.xpath("//div[contains(@class, 'lister-item mode-advanced')]"))
def _optional_field(node, query, position=0):
    """Return the stripped XPath match at *position*, or "N/A" when absent.

    Only a missing match (IndexError) is treated as "not available"; any
    other error propagates instead of being hidden.
    """
    matches = node.xpath(query)
    try:
        return matches[position].strip()
    except IndexError:
        return "N/A"


# Column order of the resulting frame.
_COLUMNS = ['name', 'img', 'rating', 'genre', 'gross',
            'runtime', 'certificate', 'metascore', 'votes', 'url']

# Collect one dict per listing and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
records = []
for item in items:
    # Fields read without a fallback: a missing one raises IndexError.
    name = item.xpath('.//h3[@class="lister-item-header"]//a/text()')[0]
    img = item.xpath('.//div[@class="lister-item-image float-left"]//a//img/@loadlate')[0]
    rating = item.xpath('.//div[@class="inline-block ratings-imdb-rating"]//strong/text()')[0]
    genre = item.xpath('.//span[@class="genre"]//text()')[0].strip()
    runtime = item.xpath('.//span[@class="runtime"]//text()')[0].strip()
    url = "https://www.imdb.com" + item.xpath('.//h3[@class="lister-item-header"]//a/@href')[0]

    # Fields that fall back to "N/A" when the listing lacks them.
    certificate = _optional_field(item, './/span[@class="certificate"]//text()')
    metascore = _optional_field(item, './/div[@class="inline-block ratings-metascore"]//span/text()')
    # data-value index 1 feeds the 'gross' column, index 0 the 'votes' column.
    # (An earlier variant read the last span's text and kept it only when it
    # contained "$".)
    eww = _optional_field(item, './/p[@class="sort-num_votes-visible"]//span/@data-value', 1)
    votes = _optional_field(item, './/p[@class="sort-num_votes-visible"]//span/@data-value', 0)

    records.append({'name': name, 'img': img, 'rating': rating, 'genre': genre, 'gross': eww,
                    'runtime': runtime, 'certificate': certificate, 'metascore': metascore,
                    'votes': votes, 'url': url})

data = pd.DataFrame(records, columns=_COLUMNS)
# `raw_data` is bound to the same DataFrame object as `data` (no copy is made),
# so the genre split below is also what gets previewed and written to the CSV.
# NOTE(review): if an untouched snapshot was intended, this needs `.copy()`.
raw_data = data
# Turn the comma-separated genre string into a list of labels per title.
data['genre'] = data['genre'].str.split(pat=",")
raw_data.head()
raw_data.to_csv(path_or_buf="IMDb_top_1000_August.csv")
Strip the surrounding whitespace from each genre label.
# Strip surrounding spaces from every genre label and assign the cleaned
# lists back to the column. Writing to the row yielded by iterrows() is not
# reliable: pandas documents that the row may be a copy, in which case the
# change never reaches `data`.
data['genre'] = data['genre'].apply(
    lambda labels: [label.strip(' ') for label in labels]
)
# Distinct genre labels across all titles, as an alphabetically sorted list.
genres = sorted({label for labels in data.genre for label in labels})
# Palette of 21 hex colors, looked up by position in the sorted genre list.
# Note that "#a9a9a9" appears twice (positions 16 and 19).
colors = [
    "#e6194B",
    "#3cb44b",
    "#FFDE0A",
    "#4363d8",
    "#f58231",
    "#911eb4",
    "#42d4f4",
    "#f032e6",
    "#bfef45",
    "#fabebe",
    "#469990",
    "#e6beff",
    "#9A6324",
    "#FFEB0A",
    "#800000",
    "#00F549",
    "#a9a9a9",
    "#FFB870",
    "#000075",
    "#a9a9a9",
    "#c6d94B",
]
def bg_color(genre, names=None, palette=None):
    """Return the palette color assigned to *genre*.

    The color is chosen by the position of *genre* in *names*. When there are
    more names than palette entries the palette wraps around instead of
    raising IndexError.

    Args:
        genre: Genre label to look up.
        names: Ordered genre labels; defaults to the module-level ``genres``.
        palette: Color strings; defaults to the module-level ``colors``.

    Returns:
        The color string for *genre*.

    Raises:
        ValueError: If *genre* is not in *names*.
        IndexError: If the palette is empty.
    """
    names = genres if names is None else names
    palette = colors if palette is None else palette
    if not palette:
        raise IndexError("color palette is empty")
    return palette[names.index(genre) % len(palette)]
# Square genre-by-genre matrix of zeros, labelled by genre on both axes.
df = pd.DataFrame(data=0, index=genres, columns=genres)
df
# Inline CSS shared by every badge in the table.
_BADGE_CSS = "display:inline-block;border-radius:10px;padding:5px 7px;color:white;background-color:#444"


def _genre_badges(labels, separator, css_prefix=""):
    """Render genre labels as <span> badges joined by *separator*.

    *css_prefix* is placed in front of each badge's background-color rule.
    The style value is deliberately left unquoted so the generated markup is
    unchanged. NOTE(review): confirm whether a quoted attribute was intended.
    """
    return separator.join(
        "<span style=" + css_prefix + "background-color:" + bg_color(label)
        + ";" + _BADGE_CSS + ">" + label + "</span>"
        for label in labels
    )


def _table_row(first, second, row, badges):
    """Build one CSV line (with its leading newline) for the data table.

    Doubled quotes ("") in the literal text are CSV escapes for a quote
    inside a quoted field.
    """
    # Double any quote in the title so it cannot end the CSV field early.
    title = row["name"].replace('"', '""')
    return (
        f'\n{first},{second},'
        f'"<img src=""{row.img}"">",'
        f'"<a style=""color:black;"" href=""{row["url"]}"">{title}</a>",'
        f'"{badges}",'
        f'"<span style=""{_BADGE_CSS}"">{row.rating}</span>",'
        f'"{row.gross}"'
    )


# Fill the genre matrix and collect the CSV lines of the data table.
# Lines are gathered in a list and joined once instead of growing a string.
table_lines = ['Genre_1,Genre_2,,Name,Genres,Rating,Gross']
for index, row in data.iterrows():
    if len(row.genre) == 1:
        # A single-genre title counts on the diagonal.
        only = row.genre[0]
        # .loc[row, column] replaces chained df[col][row] assignment, which
        # does not update the frame under pandas copy-on-write.
        df.loc[only, only] += 1
        table_lines.append(
            _table_row(only, only, row, _genre_badges(row.genre, ", ", "color:white;"))
        )
    else:
        # The badge cell is the same for every pair of this title.
        badges = _genre_badges(row.genre, "<br>", "margin-bottom:2px;color:white;")
        for first, second in itertools.combinations(row.genre, 2):
            # Count the pair in both directions to keep the matrix symmetric.
            df.loc[second, first] += 1
            df.loc[first, second] += 1
            table_lines.append(_table_row(first, second, row, badges))
data_table = "".join(table_lines)

# Notebook display: badge markup for the last title processed above.
'\n' + _genre_badges(row.genre, "<br>")
from plotapi import Chord
import json
from plotapi import Chord
# NOTE(review): the API key is committed in source; consider loading it from
# configuration or the environment instead.
Chord.api_key("5dc347fb-d747-474f-93e8-4e1a9f5d41dd")

# Visualization code here
# Build the chord diagram from the genre matrix and its data table, then
# display it.
chord_diagram = Chord(
    df.values.tolist(),
    genres,
    colors="movies",
    data_table=data_table,
    data_table_show_indices=False,
    data_table_unique_column="Name",
    compress=True,
    data_table_column_width=90,
    noun="titles",
    padding=0.03,
    animated_intro=True,
)
chord_diagram.show()
import json
# Bundle the chord inputs (matrix, genre names, data table) and write them to
# a JSON file. Note that this rebinds `data`, which until now named the
# DataFrame of scraped titles.
data = dict(
    matrix=df.values.tolist(),
    names=genres,
    data_table=data_table,
)
with open("imdb_top_1000.json", "w") as fp:
    fp.write(json.dumps(data))
# Notebook-style inspection of the final genre matrix and the genre list.
df
genres