Your data can be formatted in a few different ways, chief among them wide versus long.
Wide data looks like this:
| State | 1960 | 1970 | 1980 | 1990 | 2000 |
|---|---|---|---|---|---|
| New York | 2 | 5 | 2 | 5 | 4 |
| New Jersey | 3 | 1 | 4 | 1 | 5 |
| Arizona | 3 | 9 | 8 | 7 | 5 |
While long data looks like this:
| State | Year | Value |
|---|---|---|
| New York | 1960 | 2 |
| New York | 1970 | 5 |
| New York | 1980 | 2 |
| New York | 1990 | 5 |
| New York | 2000 | 4 |
| New Jersey | 1960 | 3 |
| New Jersey | 1970 | 1 |
| New Jersey | 1980 | 4 |
| … | … | … |
Which do you want? Depends on the application! I can think of exactly zero examples at the moment.
While you could convert between the two in pandas without too much work, I like to use the original data files in my work whenever I can. Not only does it decrease the number of mistakes you might make, but it allows you to easily update to a new release (of the census, for example) by just dropping in a new csv. Converting in d3 isn’t too tough, either.
Usually you can use part of these methods to get to where you’re going, but I’m going full-out just in case.
Your data, which we’ll call long.csv
| State | Year | Value |
|---|---|---|
| New York | 1960 | 2 |
| New York | 1970 | 5 |
| New Jersey | 1960 | 3 |
| New Jersey | 1970 | 1 |
| Arizona | 1960 | 7 |
| Arizona | 1970 | 2 |
| … | … | … |
Your data would look like
[
{ "State": "New York", "Year": 1960, "Value": 2 },
{ "State": "New York", "Year": 1970, "Value": 5 },
{ "State": "New Jersey", "Year": 1960, "Value": 3 },
// etc
]
Using d3.map() and queue:
var state_map = d3.map();
queue().
defer(d3.csv, "../long.csv", function(row) {
  // Row accessor: folds each long-format row into the wide object kept
  // for its state in state_map.
  var state = row["State"];
  // Try to get the state, if it doesn't exist make a new one
  var datapoint = state_map.get(state);
  if (!datapoint) {
    datapoint = {"State": state};
    // Store the new object in the map; without this state_map stays empty
    // and every row would start over from a fresh object
    state_map.set(state, datapoint);
  }
  // row["Year"] becomes, say, 1960, and row["Value"] is 2, so really it's
  // datapoint[1960] = 2;
  datapoint[row["Year"]] = row["Value"];
  // return the unadulterated row to go be passed to ready
  return row;
})
.await(ready);
Then, later…
state_map.get("New York");
// { "State": "New York", "1960": 2, "1970": 5 }
state_map.values();
// [
// { "State": "New York", "1960": 2, "1970": 5 },
// { "State": "New Jersey", "1960": 3, "1970": 1 },
// { "State": "Arizona", "1960": 7, "1970": 2 },
// ...
// ]
Using d3.nest() instead of d3.map():
d3.csv("../long.csv", function(error, long_data) {
// data looks like
// [
// { "State": "New York", "Year": 1960, "Value": 2 },
// { "State": "New York", "Year": 1970, "Value": 5 },
// { "State": "New Jersey", "Year": 1960, "Value": 3 },
// ... etc
// Bucket the long rows by state, then collapse each bucket into one wide object.
var grouped_by_state = d3.nest()
  .key(function(row) { return row["State"]; }) // group rows that share a state
  .rollup(function(rows) {
    // rows holds every long-format row of one group; reduce folds that
    // list into a single object, starting from an empty {}
    return rows.reduce(function(wide_row, row) {
      wide_row["State"] = row["State"];
      wide_row[row["Year"]] = row["Value"];
      return wide_row;
    }, {});
  })
  .entries(long_data); // run the grouping over the loaded rows
// each entry carries its rolled-up object under .values; keep only that
var wide = grouped_by_state.map(function(entry) {
  return entry.values;
});
})
wide would look like
[
{ "State": "New York", "1960": 2, "1970": 5 },
{ "State": "New Jersey", "1960": 3, "1970": 1 },
{ "State": "Arizona", "1960": 7, "1970": 2 },
// etc
]
If you swapped .entries(long_data) for the nest's own .map(long_data) and left off the last .map() section, wide would be keyed according to the state name, and would not be an array but would look like this instead:
{
"New York": { "State": "New York", "1960": 2, "1970": 5 },
"New Jersey": { "State": "New Jersey", "1960": 3, "1970": 1 },
"Arizona": { "State": "Arizona", "1960": 7, "1970": 2 },
// etc
}
You can find out more about .reduce over here
Your data, which we’ll call wide.csv
| State | 1960 | 1970 | 1980 | 1990 | 2000 |
|---|---|---|---|---|---|
| New York | 2 | 5 | 2 | 5 | 4 |
| New Jersey | 3 | 1 | 4 | 1 | 5 |
| Arizona | 3 | 9 | 8 | 7 | 5 |
Your data would look like
[
{ "State": "New York", "1960": 2, "1970": 5 },
{ "State": "New Jersey", "1960": 3, "1970": 1 },
{ "State": "Arizona", "1960": 3, "1970": 9 },
...
]
Using queue and ready (JavaScript):
// Collects one long-format row ({State, Value, Year}) per year column.
var long_data = [];
queue().
defer(d3.csv, "../wide.csv", function(row) {
  // Loop through all of the columns, and for each column
  // make a new row
  Object.keys(row).forEach(function(colname) {
    // 'State' identifies the row rather than holding a yearly value, so it
    // is skipped; a column literally named 'Value' is skipped as well
    if (colname === "State" || colname === "Value") {
      return;
    }
    // the column name itself is the year
    long_data.push({"State": row["State"], "Value": row[colname], "Year": colname});
  });
  // hand the row back unchanged so it is still passed along to ready
  return row;
})
.await(ready);
Then, later, long_data would look like this:
[
{ "State": "New York", "Year": 1960, "Value": 2 },
{ "State": "New York", "Year": 1970, "Value": 5 },
{ "State": "New Jersey", "Year": 1960, "Value": 3 },
// etc
]
This is honestly the same thing as Method A, just wrapped a little differently.
Without queue (JavaScript):
d3.csv(d3.csv, "../wide.csv", function(wide_data) {
var long_data = [];
wide_data.forEach( function(row) {
// Loop through all of the columns, and for each column
// make a new row
Object.keys(row).forEach( function(colname) {
// Ignore 'State' and 'Value' columns
if(colname == "State" || colname == "Value") {
return
}
long_data.push({"State": row["State"], "Value": row[colname], "Year": colname});
});
});
// do magic with long_data down here
})
Then, later, long_data would look like this:
[
{ "State": "New York", "Year": 1960, "Value": 2 },
{ "State": "New York", "Year": 1970, "Value": 5 },
{ "State": "New Jersey", "Year": 1960, "Value": 3 },
// etc
]