Copying Marilyn Monroe’s loves from
http://www.thelovesofmarilynmonroe.com/the-lovers/
First, some imports…
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
Read in the data
We’re just going to scrape it from their page.
# Fetch the gallery page. Fail loudly on an HTTP error instead of silently
# parsing an error page into an empty dataframe.
response = requests.get("http://www.thelovesofmarilynmonroe.com/the-lovers/")
response.raise_for_status()
doc = BeautifulSoup(response.text, 'html.parser')

# Every element with the "gallery-project" class is treated as one entry.
loves = doc.find_all(class_="gallery-project")

elements = []
for love in loves:
    # Each detail is taken from the node immediately following the <strong>
    # label whose text matches the given pattern.
    element = {
        'name': love.find(class_='project-title').text.title(),
        'occupation': love.find('strong', text=re.compile('.*Occupation.*')).next_sibling,
        'place_of_meeting': love.find('strong', text=re.compile('.*Place.*')).next_sibling,
        'year': love.find('strong', text=re.compile('.*Year.*')).next_sibling,
        'relationship_type': love.find('strong', text=re.compile('.*Type.*')).next_sibling,
        'image_url': love.find('img')['data-src']
    }
    # Normalise every value to a plain str: use the node's .text when it has
    # one, otherwise stringify the value itself. Only the missing-attribute
    # case is expected here, so nothing broader is swallowed.
    for key in element:
        try:
            element[key] = str(element[key].text)
        except AttributeError:
            element[key] = str(element[key])
    elements.append(element)

df = pd.DataFrame(elements)
df.head()
image_url | name | occupation | place_of_meeting | relationship_type | year | |
---|---|---|---|---|---|---|
0 | https://static1.squarespace.com/static/5623852... | Andre De Dienes | Photographer | Photoshoot | Relationship | 1945 |
1 | https://static1.squarespace.com/static/5623852... | Arthur Miller | Screenwriter | 20th Century Fox | Marriage | 1955 |
2 | https://static1.squarespace.com/static/5623852... | Ben Lyon | Actor | 20th Century Fox | Romance | 1946 |
3 | https://static1.squarespace.com/static/5623852... | Billy Travilla | Costume Designer | Film Set | Romance | 1952 |
4 | https://static1.squarespace.com/static/5623852... | Charlie Chaplin, Jr. | Actor | filmset | Romance | 1947 |
Building a graph
We’ll use the `name` and `place_of_meeting` columns to try to build a graph, and add in the `year`, `relationship_type` and `occupation` columns as edge attributes.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
# Ignore matplotlib warnings
import warnings
warnings.filterwarnings("ignore")
# One edge per dataframe row: the person's name on one end, the place of
# meeting on the other, with the listed columns attached as edge attributes.
g = nx.from_pandas_dataframe(
    df,
    source='name',
    target='place_of_meeting',
    edge_attr=['year', 'relationship_type', 'occupation'],
)
# Show every edge together with its attribute dict
g.edges(data=True)
[('Andre De Dienes',
'Photoshoot',
{'occupation': 'Photographer',
'relationship_type': 'Relationship',
'year': '1945'}),
('Photoshoot',
'David Conover',
{'occupation': 'Photographer',
'relationship_type': 'Romance',
'year': '1945'}),
('Arthur Miller',
'20th Century Fox',
{'occupation': 'Screenwriter',
'relationship_type': 'Marriage',
'year': '1955'}),
('20th Century Fox',
'Ben Lyon',
{'occupation': 'Actor', 'relationship_type': 'Romance', 'year': '1946'}),
('20th Century Fox',
'Frank Sinatra',
{'occupation': 'Actor',
'relationship_type': 'Relationship',
'year': '1954'}),
('20th Century Fox',
'Hal Schaefer',
{'occupation': 'Vocal Coach',
'relationship_type': 'Rumour',
'year': '1954'}),
('20th Century Fox',
'Joseph Schenck',
{'occupation': 'Company Owner ',
'relationship_type': 'Rumour',
'year': '1947'}),
('20th Century Fox',
'Marlon Brando',
{'occupation': 'Actor ', 'relationship_type': 'Rumour', 'year': '1955'}),
('20th Century Fox',
'Robert Slatzer',
{'occupation': 'Writer/reporter',
'relationship_type': 'Marriage/relationship',
'year': '1946'}),
('20th Century Fox',
'Tommy Zahn',
{'occupation': 'Actor', 'relationship_type': 'Romance', 'year': '1946'}),
('Billy Travilla',
'Film Set',
{'occupation': 'Costume Designer',
'relationship_type': 'Romance',
'year': '1952'}),
('Film Set',
'Edward G. Robinson Jr',
{'occupation': 'Actor', 'relationship_type': 'Romance', 'year': '1956'}),
('Film Set',
'Elia Kazan',
{'occupation': 'Film and Stage Director',
'relationship_type': 'Romance',
'year': '1950'}),
('Film Set',
'Howard Hughes',
{'occupation': 'Film Director',
'relationship_type': 'Romance',
'year': '1952'}),
('Film Set',
'John Carroll',
{'occupation': 'Actor ', 'relationship_type': 'Rumour', 'year': '1947'}),
('Film Set',
'Mel Torme',
{'occupation': 'Musician ', 'relationship_type': 'Romance', 'year': '1955'}),
('Film Set',
'Milton Berle',
{'occupation': 'TV Entertainer ',
'relationship_type': 'Romance',
'year': '1948'}),
('Film Set',
'Yves Montand',
{'occupation': 'Actor', 'relationship_type': 'Romance', 'year': '1959'}),
('Charlie Chaplin, Jr.',
'filmset',
{'occupation': 'Actor', 'relationship_type': 'Romance', 'year': '1947'}),
('Fred Karger',
'Columbia Pictures',
{'occupation': 'Vocal coach',
'relationship_type': 'Relationship',
'year': '1948'}),
('Columbia Pictures',
'Natasha Lytess',
{'occupation': 'Drama coach',
'relationship_type': 'Romance',
'year': '1948'}),
('Georgie Jessel',
'Party',
{'occupation': 'Entertainer',
'relationship_type': 'Rumour',
'year': '1948'}),
('Party',
'Milton H. Greene',
{'occupation': 'Photographer',
'relationship_type': 'Rumour',
'year': '1955'}),
('Party',
'Robert F. Kennedy',
{'occupation': ' ', 'relationship_type': 'Relationship', 'year': '1961'}),
('Party',
'Spyros Skouras',
{'occupation': ' President of 20th Century Fox',
'relationship_type': 'Rumour',
'year': '1951'}),
('Hans Jorgen Lembourn',
'Unknown',
{'occupation': 'Writer', 'relationship_type': 'Rumour', 'year': '1958'}),
('Unknown',
'Sammy Davis Jr',
{'occupation': 'Entertainer',
'relationship_type': 'Romance',
'year': '1954'}),
('Henry Rosenfeld',
'New York',
{'occupation': 'Dress Manufacturer',
'relationship_type': 'Romance',
'year': '1959'}),
('James Bacon',
'Restaurant',
{'occupation': 'Writer (columnist and author)',
'relationship_type': 'Romance',
'year': '1948'}),
('Restaurant',
'Joe Dimaggio',
{'occupation': 'Baseball player',
'relationship_type': 'Marriage',
'year': '1952'}),
('John F. Kennedy',
'Dinner Party',
{'occupation': 'President',
'relationship_type': 'Relationship',
'year': '1950'}),
('Jim Dougherty',
'School',
{'occupation': 'Marine', 'relationship_type': 'Marriage', 'year': '1942'}),
('Johnny Hyde',
'Racquet Club',
{'occupation': 'Agent',
'relationship_type': 'Relationship',
'year': '1949'}),
('Jose Bolanos',
'Mexico City',
{'occupation': 'writer/producer',
'relationship_type': 'Romance',
'year': '1962'}),
('Nico Minardos',
'Film set',
{'occupation': 'Actor',
'relationship_type': 'Relationship',
'year': '1952'}),
('Sydney Chaplin',
'racquets club',
{'occupation': 'Actor', 'relationship_type': 'Romance', 'year': '1947'})]
We don’t need to build the graph again, we can just start drawing it!
Let’s start by drawing the graph we already pulled out of the dataframe…
# Draw the graph using networkx's default drawing settings
nx.draw(g)
Which is pretty ugly! So let’s style it a little bit…
plt.figure(figsize=(20, 20))

# Compute the node positions once so that edges, nodes and labels are all
# drawn against the same layout.
layout = nx.spring_layout(g, iterations=40, k=0.07)

nx.draw_networkx_edges(g, layout)
nx.draw_networkx_nodes(g, layout)
nx.draw_networkx_labels(g, layout)

# Edge labels! Map each (u, v) edge to its relationship type.
labels = {
    (u, v): data['relationship_type']
    for u, v, data in g.edges(data=True)
}
nx.draw_networkx_edge_labels(g, layout, edge_labels=labels)

plt.axis('off')
plt.show()
Coloring the nodes: Places of meeting vs. the relationships
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 15))
g = nx.from_pandas_dataframe(df, source='name', target='place_of_meeting', edge_attr=['relationship_type', 'year', 'occupation'])
layout = nx.spring_layout(g, iterations=30, k=0.05)

# Build the lookup sets once, instead of recomputing the unique values of a
# column (and scanning them linearly) for every node in the graph.
# Bracket access is used because attribute access (df.name) only works while
# no DataFrame attribute of the same name exists.
place_names = set(df['place_of_meeting'])
person_names = set(df['name'])

# Pull out the places, draw them
places = [node for node in g.nodes() if node in place_names]
nx.draw_networkx_nodes(g, layout, nodelist=places, node_color='#8dd3c7', node_size=500)

# Pull out the people, draw them
people = [node for node in g.nodes() if node in person_names]
nx.draw_networkx_nodes(g, layout, nodelist=people, node_color='#fb8072', node_size=100)

# Draw the edges and the labels
nx.draw_networkx_edges(g, layout)
nx.draw_networkx_labels(g, layout)

plt.axis('off')
plt.show()
But wait, where’s Marilyn?
# Show the first three rows of the dataframe
df.head(3)
image_url | name | occupation | place_of_meeting | relationship_type | year | |
---|---|---|---|---|---|---|
0 | https://static1.squarespace.com/static/5623852... | Andre De Dienes | Photographer | Photoshoot | Relationship | 1945 |
1 | https://static1.squarespace.com/static/5623852... | Arthur Miller | Screenwriter | 20th Century Fox | Marriage | 1955 |
2 | https://static1.squarespace.com/static/5623852... | Ben Lyon | Actor | 20th Century Fox | Romance | 1946 |
g = nx.Graph()
for i, row in df.iterrows():
    # Create a node for Andre, for Arthur, etc.,
    # and add the rest of the row as attributes for that node.
    # The row is unpacked into keyword arguments rather than passed as
    # attr_dict=...: keyword attributes are accepted by both networkx 1.x
    # and 2.x, whereas 2.x would store a single attribute literally named
    # "attr_dict".
    g.add_node(row['name'], **row.to_dict())
    # Then draw an edge between e.g. "Arthur Miller" and "Marilyn Monroe"
    g.add_edge("Marilyn Monroe", row['name'])
nx.draw(g)
# Show the first three rows of the dataframe
df.head(3)
image_url | name | occupation | place_of_meeting | relationship_type | year | |
---|---|---|---|---|---|---|
0 | https://static1.squarespace.com/static/5623852... | Andre De Dienes | Photographer | Photoshoot | Relationship | 1945 |
1 | https://static1.squarespace.com/static/5623852... | Arthur Miller | Screenwriter | 20th Century Fox | Marriage | 1955 |
2 | https://static1.squarespace.com/static/5623852... | Ben Lyon | Actor | 20th Century Fox | Romance | 1946 |
g = nx.Graph()

# Every node carries a node_type attribute ('marilyn', 'person' or 'place')
# so the groups can be selected and styled separately later on.
g.add_node("Marilyn Monroe", node_type='marilyn')

for i, row in df.iterrows():
    person = row['name']
    place = row['place_of_meeting']

    # One node for the person, one for the place where they met
    g.add_node(person, node_type='person')
    g.add_node(place, node_type='place')

    # Marilyn connects to the place, and the place connects to the person.
    # Each edge gets an edge_type so it can be looked up later.
    g.add_edge("Marilyn Monroe", place, edge_type='meeting_place')
    g.add_edge(place, person, edge_type=row['relationship_type'])

nx.draw(g)
# List every node together with its attribute dict
g.nodes(data=True)
[('Marilyn Monroe', {'node_type': 'marilyn'}),
('Andre De Dienes', {'node_type': 'person'}),
('Photoshoot', {'node_type': 'place'}),
('Arthur Miller', {'node_type': 'person'}),
('20th Century Fox', {'node_type': 'place'}),
('Ben Lyon', {'node_type': 'person'}),
('Billy Travilla', {'node_type': 'person'}),
('Film Set', {'node_type': 'place'}),
('Charlie Chaplin, Jr.', {'node_type': 'person'}),
('filmset', {'node_type': 'place'}),
('David Conover', {'node_type': 'person'}),
('Edward G. Robinson Jr', {'node_type': 'person'}),
('Elia Kazan', {'node_type': 'person'}),
('Frank Sinatra', {'node_type': 'person'}),
('Fred Karger', {'node_type': 'person'}),
('Columbia Pictures', {'node_type': 'place'}),
('Georgie Jessel', {'node_type': 'person'}),
('Party', {'node_type': 'place'}),
('Hal Schaefer', {'node_type': 'person'}),
('Hans Jorgen Lembourn', {'node_type': 'person'}),
('Unknown', {'node_type': 'place'}),
('Henry Rosenfeld', {'node_type': 'person'}),
('New York', {'node_type': 'place'}),
('Howard Hughes', {'node_type': 'person'}),
('James Bacon', {'node_type': 'person'}),
('Restaurant', {'node_type': 'place'}),
('John F. Kennedy', {'node_type': 'person'}),
('Dinner Party', {'node_type': 'place'}),
('Jim Dougherty', {'node_type': 'person'}),
('School', {'node_type': 'place'}),
('Joe Dimaggio', {'node_type': 'person'}),
('John Carroll', {'node_type': 'person'}),
('Johnny Hyde', {'node_type': 'person'}),
('Racquet Club', {'node_type': 'place'}),
('Jose Bolanos', {'node_type': 'person'}),
('Mexico City', {'node_type': 'place'}),
('Joseph Schenck', {'node_type': 'person'}),
('Marlon Brando', {'node_type': 'person'}),
('Mel Torme', {'node_type': 'person'}),
('Milton Berle', {'node_type': 'person'}),
('Milton H. Greene', {'node_type': 'person'}),
('Natasha Lytess', {'node_type': 'person'}),
('Nico Minardos', {'node_type': 'person'}),
('Film set', {'node_type': 'place'}),
('Robert Slatzer', {'node_type': 'person'}),
('Robert F. Kennedy', {'node_type': 'person'}),
('Sammy Davis Jr', {'node_type': 'person'}),
('Spyros Skouras', {'node_type': 'person'}),
('Sydney Chaplin', {'node_type': 'person'}),
('racquets club', {'node_type': 'place'}),
('Tommy Zahn', {'node_type': 'person'}),
('Yves Montand', {'node_type': 'person'})]
# Materialise every edge, with its attribute dict, as a list
list(g.edges(data=True))
[('Marilyn Monroe', 'Photoshoot', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', '20th Century Fox', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Film Set', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'filmset', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Columbia Pictures', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Party', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Unknown', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'New York', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Restaurant', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Dinner Party', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'School', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Racquet Club', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Mexico City', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'Film set', {'edge_type': 'meeting_place'}),
('Marilyn Monroe', 'racquets club', {'edge_type': 'meeting_place'}),
('Andre De Dienes', 'Photoshoot', {'edge_type': 'Relationship'}),
('Photoshoot', 'David Conover', {'edge_type': 'Romance'}),
('Arthur Miller', '20th Century Fox', {'edge_type': 'Marriage'}),
('20th Century Fox', 'Ben Lyon', {'edge_type': 'Romance'}),
('20th Century Fox', 'Frank Sinatra', {'edge_type': 'Relationship'}),
('20th Century Fox', 'Hal Schaefer', {'edge_type': 'Rumour'}),
('20th Century Fox', 'Joseph Schenck', {'edge_type': 'Rumour'}),
('20th Century Fox', 'Marlon Brando', {'edge_type': 'Rumour'}),
('20th Century Fox',
'Robert Slatzer',
{'edge_type': 'Marriage/relationship'}),
('20th Century Fox', 'Tommy Zahn', {'edge_type': 'Romance'}),
('Billy Travilla', 'Film Set', {'edge_type': 'Romance'}),
('Film Set', 'Edward G. Robinson Jr', {'edge_type': 'Romance'}),
('Film Set', 'Elia Kazan', {'edge_type': 'Romance'}),
('Film Set', 'Howard Hughes', {'edge_type': 'Romance'}),
('Film Set', 'John Carroll', {'edge_type': 'Rumour'}),
('Film Set', 'Mel Torme', {'edge_type': 'Romance'}),
('Film Set', 'Milton Berle', {'edge_type': 'Romance'}),
('Film Set', 'Yves Montand', {'edge_type': 'Romance'}),
('Charlie Chaplin, Jr.', 'filmset', {'edge_type': 'Romance'}),
('Fred Karger', 'Columbia Pictures', {'edge_type': 'Relationship'}),
('Columbia Pictures', 'Natasha Lytess', {'edge_type': 'Romance'}),
('Georgie Jessel', 'Party', {'edge_type': 'Rumour'}),
('Party', 'Milton H. Greene', {'edge_type': 'Rumour'}),
('Party', 'Robert F. Kennedy', {'edge_type': 'Relationship'}),
('Party', 'Spyros Skouras', {'edge_type': 'Rumour'}),
('Hans Jorgen Lembourn', 'Unknown', {'edge_type': 'Rumour'}),
('Unknown', 'Sammy Davis Jr', {'edge_type': 'Romance'}),
('Henry Rosenfeld', 'New York', {'edge_type': 'Romance'}),
('James Bacon', 'Restaurant', {'edge_type': 'Romance'}),
('Restaurant', 'Joe Dimaggio', {'edge_type': 'Marriage'}),
('John F. Kennedy', 'Dinner Party', {'edge_type': 'Relationship'}),
('Jim Dougherty', 'School', {'edge_type': 'Marriage'}),
('Johnny Hyde', 'Racquet Club', {'edge_type': 'Relationship'}),
('Jose Bolanos', 'Mexico City', {'edge_type': 'Romance'}),
('Nico Minardos', 'Film set', {'edge_type': 'Relationship'}),
('Sydney Chaplin', 'racquets club', {'edge_type': 'Romance'})]
Let’s draw it more nicely
plt.figure(figsize=(15, 15))
layout = nx.spring_layout(g, iterations=50, k=0.05)

# Select the edges that get highlighted, by their edge_type
marriages = [
    (u, v)
    for u, v, attrs in g.edges(data=True)
    if attrs['edge_type'] == "Marriage"
]
romances = [
    (u, v)
    for u, v, attrs in g.edges(data=True)
    if attrs['edge_type'] == "Romance"
]

# Select the three groups of nodes, by their node_type
marilyn = [name for name, attrs in g.nodes(data=True) if attrs['node_type'] == "marilyn"]
places = [name for name, attrs in g.nodes(data=True) if attrs['node_type'] == 'place']
people = [name for name, attrs in g.nodes(data=True) if attrs['node_type'] == 'person']

# All edges first, then the highlighted ones drawn over them
nx.draw_networkx_edges(g, layout)
nx.draw_networkx_edges(g, layout, edgelist=marriages, edge_color='orange', width=3)
nx.draw_networkx_edges(g, layout, edgelist=romances, edge_color='green', width=3)

# Each node group gets its own colour and size
nx.draw_networkx_nodes(g, layout, nodelist=marilyn, node_color='lightgrey', node_size=1000)
nx.draw_networkx_nodes(g, layout, nodelist=places, node_color='#8dd3c7', node_size=500)
nx.draw_networkx_nodes(g, layout, nodelist=people, node_color='#fb8072', node_size=100)

nx.draw_networkx_labels(g, layout)

plt.axis('off')
plt.show()
Okay, I give up, let’s take it somewhere else
Save it as an adjacency list
In an adjacency list, the first item on each line is a node, and everything after it is the nodes it is connected to.
DOWNSIDE: It doesn’t save any extra information for the connections.
# Write the graph to a comma-delimited adjacency-list text file
nx.write_adjlist(g, 'marilyn_adj_list.txt', delimiter=',')
# Preview the first ten lines of the file (IPython shell escape)
!head -n 10 marilyn_adj_list.txt
#/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py -f /Users/jonathansoma/Library/Jupyter/runtime/kernel-4b7f6739-ab9a-4afa-b889-5b372b85fc96.json
# GMT Thu Aug 24 15:50:01 2017
#
Marilyn Monroe,Photoshoot,20th Century Fox,Film Set,filmset,Columbia Pictures,Party,Unknown,New York,Restaurant,Dinner Party,School,Racquet Club,Mexico City,Film set,racquets club
Andre De Dienes,Photoshoot
Photoshoot,David Conover
Arthur Miller,20th Century Fox
20th Century Fox,Ben Lyon,Frank Sinatra,Hal Schaefer,Joseph Schenck,Marlon Brando,Robert Slatzer,Tommy Zahn
Ben Lyon
Billy Travilla,Film Set
Save it as an edge list
This is… a list of edges. The first element is a node, the second element is a node, and then there’s any extra data about that relationship.
DOWNSIDE: Doesn’t store attributes about the nodes
# Write the graph to a comma-delimited edge-list text file
nx.write_edgelist(g, 'marilyn_edge_list.txt', delimiter=',')
# Preview the first five lines of the file (IPython shell escape)
!head -n 5 marilyn_edge_list.txt
Marilyn Monroe,Photoshoot,{'edge_type': 'meeting_place'}
Marilyn Monroe,20th Century Fox,{'edge_type': 'meeting_place'}
Marilyn Monroe,Film Set,{'edge_type': 'meeting_place'}
Marilyn Monroe,filmset,{'edge_type': 'meeting_place'}
Marilyn Monroe,Columbia Pictures,{'edge_type': 'meeting_place'}
Save it as a GEXF
GEXF is a fancy XML format that you can use in a ton of places!
# Write the graph to a GEXF (XML) file
nx.write_gexf(g, 'marilyn.gexf')
# Preview the first ten lines of the file (IPython shell escape)
!head -n 10 marilyn.gexf
<?xml version='1.0' encoding='utf-8'?>
<gexf version="1.1" xmlns="http://www.gexf.net/1.1draft" xmlns:viz="http://www.gexf.net/1.1draft/viz" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.w3.org/2001/XMLSchema-instance">
<graph defaultedgetype="undirected" mode="static">
<attributes class="edge" mode="static">
<attribute id="1" title="edge_type" type="string" />
</attributes>
<attributes class="node" mode="static">
<attribute id="0" title="node_type" type="string" />
</attributes>
<nodes>
How do we open this?
Go get Gephi