-
读取csv文件:
# Load the Fandango score-comparison dataset and preview two of its columns.
import pandas as pd

fandango = pd.read_csv('fandango_score_comparison.csv')

# Selecting a single column of a DataFrame yields a Series.
series_film = fandango['FILM']
# First five film titles (positional slice).
print(series_film.iloc[:5])

series_rt = fandango['RottenTomatoes']
# First five Rotten Tomatoes scores (positional slice).
print(series_rt.iloc[:5])
运行结果:
-
制作Series
# Build a custom Series: Rotten Tomatoes scores labelled by film title.
from pandas import Series

# .values exposes the raw array behind each column.
film_names = series_film.values
rt_scores = series_rt.values

series_custom = Series(data=rt_scores, index=film_names)

# Passing a list of labels selects the matching entries. As a bare
# expression this is only displayed in an interactive session.
series_custom[['Minions (2015)', 'Leviathan (2014)']]
运行结果:
-
打印结果
# Integer positions are still available even though the Series is labelled
# by film title.
series_custom = Series(data=rt_scores, index=film_names)

# Label-based selection with a list of titles (bare expression, displayed
# only in an interactive session).
series_custom[['Minions (2015)', 'Leviathan (2014)']]

# Entries at positions 5 through 9; .iloc makes the positional intent explicit.
fiveten = series_custom.iloc[5:10]
print(fiveten)
运行结果:
-
排序
# Reorder the Series so that its labels appear in sorted order.
original_index = series_custom.index.tolist()
print(original_index)

# Sort a copy of the label list, leaving original_index untouched.
sorted_index = list(original_index)
sorted_index.sort()
print(sorted_index)

# reindex() returns a new Series whose entries follow the given label order.
sorted_by_index = series_custom.reindex(index=sorted_index)
print(sorted_by_index)
运行结果:
-
排序索引
# Two built-in orderings: by label (sc2) and by value (sc3).
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()

# Only the value-sorted result is previewed: its first ten entries.
print(sc3.iloc[:10])
运行结果:
-
相加
# NumPy functions can be applied directly to a Series.
import numpy as np

# Element-wise sum of the Series with itself.
doubled = np.add(series_custom, series_custom)
print(doubled)

# Element-wise sine of every value (bare expression, displayed only in an
# interactive session).
np.sin(series_custom)

# Largest value in the Series: a single scalar rather than a Series.
np.max(series_custom)
运行结果:
-
判断
# Comparing a Series with a scalar yields a boolean Series, one flag per film
# (bare expression, displayed only in an interactive session).
series_custom > 50

# A boolean Series used as an indexer keeps only the entries flagged True.
criteria_one = series_custom > 50
series_greater_than_50 = series_custom[criteria_one]

# Combine two masks with & to keep scores strictly between 50 and 75.
criteria_two = series_custom < 75
in_range = criteria_one & criteria_two
both_criteria = series_custom[in_range]
print(both_criteria)
运行结果:
-
运算
# Arithmetic between two Series aligns entries by index label; both Series
# here share the same FILM index.
film_labels = fandango['FILM']
rt_critics = Series(fandango['RottenTomatoes'].values, index=film_labels)
rt_users = Series(fandango['RottenTomatoes_User'].values, index=film_labels)

# Average of the critic and user scores for each film.
score_sum = rt_critics + rt_users
rt_mean = score_sum / 2
print(rt_mean)
运行结果:
-
set_index
# Re-read the dataset and index it by film title.
fandango = pd.read_csv('fandango_score_comparison.csv')

# print is a function in Python 3; the statement form `print type(...)`
# is a SyntaxError there.
print(type(fandango))

# set_index() returns a new DataFrame indexed by the values of the given
# column. By default that column is removed from the frame; drop=False keeps
# FILM available as a regular column as well.
fandango_films = fandango.set_index('FILM', drop=False)

# Inspect fandango_films.index to see the new row labels.
运行结果:
-
使用新索引
# Row selection by film title on the FILM-indexed DataFrame. The bare
# expressions below are only displayed in an interactive session.
first_film = "Avengers: Age of Ultron (2015)"
last_film = "Hot Tub Time Machine 2 (2015)"

# A label slice works with plain brackets and with .loc[].
fandango_films[first_film:last_film]
fandango_films.loc[first_film:last_film]

# A single label selects one movie; the result is a Series.
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']

# A list of labels selects several movies; the result is a DataFrame.
movies = [
    'Kumiko, The Treasure Hunter (2015)',
    'Do You Believe? (2015)',
    'Ant-Man (2015)',
]
fandango_films.loc[movies]
运行结果:
-
类型转化
# Compute the standard deviation of every float64 column using apply().
import numpy as np

# dtypes is a Series mapping each column name to its data type.
types = fandango_films.dtypes

# Keep only the names of the columns whose dtype is float64.
is_float = types.values == 'float64'
float_columns = types[is_float].index

# Restrict the DataFrame to those float columns.
float_df = fandango_films[float_columns]

# apply() calls the function once per column; `column` is a Series.
deviations = float_df.apply(lambda column: np.std(column))
print(deviations)
运行结果:
-
使用匿名函数（lambda）调用 np.std()，按行计算标准差
# Compare the two normalised user scores film by film.
# NOTE(review): 'Metacritic_user_nom' is kept exactly as written; presumably
# it matches the CSV header -- verify against the data file.
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]

# axis=1 applies the function to each row, giving one standard deviation
# per film (bare expression, displayed only in an interactive session).
rt_mt_user.apply(lambda row: np.std(row), axis=1)
运行结果: