#############################
# Examples using pandas     #
#                           #
#############################
import re
from pprint import pprint

import numpy as np
import pandas as pd

# Input and output locations used by the walkthrough.
DATA_FILE = "data.csv"
OUTPUT_FILE = "out.txt"
# The raw line read and the pandas read must decode the file the same way.
DATA_ENCODING = "latin1"


def read_lines(path, encoding=DATA_ENCODING):
    """Return the lines of the text file at *path* without trailing newlines.

    The file is opened in a ``with`` block so the handle is closed even if
    reading fails part-way through.
    """
    with open(path, "r", encoding=encoding) as data_file:
        return [line.rstrip("\n") for line in data_file]


def state_mask(city_state, state):
    """Return a boolean mask of rows whose "City, State" value is in *state*.

    Only the text after the comma is compared, so a city whose name merely
    contains the state code (e.g. "CARSON CITY, NV" for "CA") does not match.
    Missing values yield ``False``. The comparison is case-sensitive.
    """
    pattern = r",\s*" + re.escape(state) + r"\s*$"
    return city_state.str.contains(pattern, na=False)


def split_city_state(city_state):
    """Split a "City, State" Series into a (city, state) pair of Series.

    Surrounding whitespace is stripped from both parts, so "WEST HILLS, CA"
    yields "WEST HILLS" and "CA" (not " CA"). Missing values stay missing,
    a value without a comma gets a missing state, and anything after a
    second comma is ignored.
    """
    parts = (
        city_state.str.split(",", expand=True)
        # Guarantee both columns exist even when no value contains a comma.
        .reindex(columns=[0, 1])
        # An all-missing column would otherwise be float and reject `.str`.
        .astype(object)
    )
    return parts[0].str.strip(), parts[1].str.strip()


def main():
    """Run the pandas walkthrough on DATA_FILE and write the result to OUTPUT_FILE."""
    # Read the raw file into a plain list of lines
    raw_lines = read_lines(DATA_FILE)
    print("\nReading array")
    pprint(raw_lines)

    # Read in data directly into pandas
    frame = pd.read_csv(DATA_FILE, sep=",", encoding=DATA_ENCODING)
    print("\n\nData Frame format")
    pprint(frame)

    # Print first 3 rows
    print("\n\nData Frame first three rows")
    pprint(frame[:3])

    # Print 'Longitude' column
    print("\n\nData Frame Longitude")
    pprint(frame["Longitude"])

    # Print first 5 rows of user ratings
    print("\n\nData Frame first 5 rows of User Raitings Column")
    pprint(frame["User Ratings"][:5])

    # Print the first 5 rows of lat, long
    print("\n\nData Frame Lat, Long")
    pprint(frame[["Latitude", "Longitude"]][:5])

    # See counts of each value for User Ratings
    rating_counts = frame["User Ratings"].value_counts()
    print("\n\nUser rating counts")
    pprint(rating_counts)

    # Find all the rows in West Hills California
    west_hills_rows = frame["City, State"] == "WEST HILLS, CA"
    print("\n\nMatching rows")
    pprint(frame[west_hills_rows])

    # Find all rows in CA
    print("\n\nMatching CA rows")
    ca_rows = state_mask(frame["City, State"], "CA")
    pprint(ca_rows)
    pprint(frame[ca_rows])

    # Find all the rows with a list of values
    # NOTE(review): these are strings; if read_csv parses "User Ratings" as
    # numbers, none of them will match -- confirm against the data.
    value_list = ["1", "2", "a"]
    frame_with_values = frame[frame["User Ratings"].isin(value_list)]
    print("\n\nValue List")
    print(frame_with_values)

    # Find the unique values in a data frame column
    unique_ratings = frame["User Ratings"].unique()
    print("\n\nUnique Ratings")
    print(unique_ratings)

    #####################
    # Clean the data
    #####################

    # Fix the upper and lower case
    frame["City, State"] = frame["City, State"].str.upper()
    print("\n\nUpper Case")
    pprint(frame[:10])

    # Split the 'City, State' cell into two data series and assign them to
    # new columns in the data frame
    frame["City"], frame["State"] = split_city_state(frame["City, State"])
    print("\n\nNew DataFrame columns")
    pprint(frame[:10])

    # Delete a column from a data frame
    del frame["City, State"]
    print("\n\nRemove column")
    pprint(frame[:10])

    # Iterate through data frame
    print("\n\nIterate through")
    for index, row in frame.iterrows():
        print(index, row["State"])

    # Sort data by multiple columns: State ascending, then City descending
    frame = frame.sort_values(by=["State", "City"], ascending=[True, False])
    pprint(frame[:10])

    # Write the data to a file
    frame.to_csv(OUTPUT_FILE)


if __name__ == "__main__":
    main()