课题组每周研讨会
pip install pandas
import pandas as pd
是带标签的一维数组
In [3]: s = pd.Series([1, 3, 5, 6, 8])
In [4]: s
Out[4]:
0 1
1 3
2 5
3 6
4 8
dtype: int64
左边是索引,如果没有索引,会自动创建一个从0开始的索引。
In [5]: s = pd.Series([1, 3, 5, 6, 8], index = ['a', 'b', 'c', 'd', 'e'])
In [6]: s
Out[6]:
a 1
b 3
c 5
d 6
e 8
dtype: int64
可以通过index
和values
来访问索引和值
In [12]: s.index
Out[12]: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
In [13]: s.values
Out[13]: array([1, 3, 5, 6, 8], dtype=int64)
In [14]: s['a']
Out[14]: 1
In [15]: s[['a', 'b']]
Out[15]:
a 1
b 3
dtype: int64
In [16]: s2 = s * 2
In [17]: s2
Out[17]:
a 2
b 6
c 10
d 12
e 16
dtype: int64
In [18]: s2[s > 3] #选取s中大于3的位置在s2中对应的值
Out[18]:
c 10
d 12
e 16
dtype: int64
In [22]: df = pd.DataFrame({"A" : 1.,
"B" : pd.Timestamp('20200902'),
"C" : pd.Series(range(4), index = list(range(4)), dtype="float32")})
In [50]: df
Out[50]:
A B C
0 1.0 2020-09-02 0.0
1 1.0 2020-09-02 1.0
2 1.0 2020-09-02 2.0
3 1.0 2020-09-02 3.0
In [26]: df2 = pd.read_table("/path/to/sample.tsv")
In [27]: df2
Out[27]:
samples plat fq1 fq2 type Patient
0 A ILLUMINA a_1.fastq a_2.fastq T 1
1 B ILLUMINA b_1.fastq b_2.fastq N 1
2 C ILLUMINA c_1.fastq c_2.fastq T 2
3 D ILLUMINA d.fastq NaN N 2
In [33]: df.dtypes
Out[33]:
A float64
B datetime64[ns]
C float32
dtype: object
In [52]: df.head(2)
Out[52]:
A B C
0 1.0 2020-09-02 0.0
1 1.0 2020-09-02 1.0
In [53]: df.tail(2)
Out[53]:
A B C
2 1.0 2020-09-02 2.0
3 1.0 2020-09-02 3.0
In [40]: df.index
Out[40]: Int64Index([0, 1, 2, 3], dtype='int64')
In [41]: df.columns
Out[41]: Index(['A', 'B', 'C'], dtype='object')
In [57]: df.describe()
Out[57]:
A C
count 4.0 4.000000
mean 1.0 1.500000
std 0.0 1.290994
min 1.0 0.000000
25% 1.0 0.750000
50% 1.0 1.500000
75% 1.0 2.250000
max 1.0 3.000000
In [58]: df.T
Out[58]:
0 1 2 3
A 1 1 1 1
B 2020-09-02 00:00:00 2020-09-02 00:00:00 2020-09-02 00:00:00 2020-09-02 00:00:00
C 0 1 2 3
In [59]: df.sort_index(axis=0, ascending=False)
Out[59]:
A B C
3 1.0 2020-09-02 3.0
2 1.0 2020-09-02 2.0
1 1.0 2020-09-02 1.0
0 1.0 2020-09-02 0.0
In [60]: df.sort_index(axis=1, ascending=False)
Out[60]:
C B A
0 0.0 2020-09-02 1.0
1 1.0 2020-09-02 1.0
2 2.0 2020-09-02 1.0
3 3.0 2020-09-02 1.0
In [63]: df.sort_values(by = 'C', ascending=False)
Out[63]:
A B C
3 1.0 2020-09-02 3.0
2 1.0 2020-09-02 2.0
1 1.0 2020-09-02 1.0
0 1.0 2020-09-02 0.0
In [65]: df.A
Out[65]:
0 1.0
1 1.0
2 1.0
3 1.0
Name: A, dtype: float64
与df['A']
等效
In [67]: df[0:3]
Out[67]:
A B C
0 1.0 2020-09-02 0.0
1 1.0 2020-09-02 1.0
2 1.0 2020-09-02 2.0
.loc
In [79]: df.lofd
Out[79]:
0 0.0
1 1.0
2 2.0
3 3.0
Name: C, dtype: float32
In [80]: df.loc[[0, 2], "C"]
Out[80]:
0 0.0
2 2.0
Name: C, dtype: float32
In [81]: df.loc[[0, 2], :]
Out[81]:
A B C
0 1.0 2020-09-02 0.0
2 1.0 2020-09-02 2.0
In [82]: df.loc[0]
Out[82]:
A 1
B 2020-09-02 00:00:00
C 0
Name: 0, dtype: object
.iloc
按照位置选取
In [86]: df
Out[86]:
A B C
0 1.0 2020-09-02 0.0
1 1.0 2020-09-02 1.0
2 1.0 2020-09-02 2.0
3 1.0 2020-09-02 3.0
In [87]: df.iloc[3]
Out[87]:
A 1
B 2020-09-02 00:00:00
C 3
Name: 3, dtype: object
In [88]: df.iloc[1:3, :
...: ]
Out[88]:
A B C
1 1.0 2020-09-02 1.0
2 1.0 2020-09-02 2.0
In [89]: df.iloc[1:3, 0:2]
Out[89]:
A B
1 1.0 2020-09-02
2 1.0 2020-09-02
In [91]: df.iloc[[1, 0, 3], [2, 0]]
Out[91]:
C A
1 1.0 1.0
0 0.0 1.0
3 3.0 1.0
In [93]: df[df.C > 1.0]
Out[93]:
A B C
2 1.0 2020-09-02 2.0
3 1.0 2020-09-02 3.0
用isin()
选取
In [101]: df2 = df.copy()
In [102]: df2['E'] = ['one', 'one', 'two', 'three']
In [103]: df2
Out[103]:
A B C E
0 1.0 2020-09-02 0.0 one
1 1.0 2020-09-02 1.0 one
2 1.0 2020-09-02 2.0 two
3 1.0 2020-09-02 3.0 three
In [104]: df2[df2['E'].isin(['one', 'three'])]
Out[104]:
A B C E
0 1.0 2020-09-02 0.0 one
1 1.0 2020-09-02 1.0 one
3 1.0 2020-09-02 3.0 three
按照索引自动对齐
In [105]: s1 = pd.Series([1,2,3], index = [0, 2, 3])
In [106]: s1
Out[106]:
0 1
2 2
3 3
dtype: int64
In [107]: df['F'] = s1
In [108]: df
Out[108]:
A B C F
0 1.0 2020-09-02 0.0 1.0
1 1.0 2020-09-02 1.0 NaN
2 1.0 2020-09-02 2.0 2.0
3 1.0 2020-09-02 3.0 3.0
按照标签赋值
In [114]: df.at[s1[0], 'A'] = 0
In [115]: df
Out[115]:
A B C F
0 1.0 2020-09-02 0.0 1.0
1 0.0 2020-09-02 1.0 NaN
2 1.0 2020-09-02 2.0 2.0
3 1.0 2020-09-02 3.0 3.0
按照位置赋值
In [116]: df.iat[0, 1] = 0
In [117]: df
Out[117]:
A B C F
0 1.0 1970-01-01 0.0 1.0
1 0.0 2020-09-02 1.0 NaN
2 1.0 2020-09-02 2.0 2.0
3 1.0 2020-09-02 3.0 3.0
In [118]: df.iat[0, 2] = 0
In [119]: df
Out[119]:
A B C F
0 1.0 1970-01-01 0.0 1.0
1 0.0 2020-09-02 1.0 NaN
2 1.0 2020-09-02 2.0 2.0
3 1.0 2020-09-02 3.0 3.0
In [120]: df.iat[0, 2] = 1
In [121]: df
Out[121]:
A B C F
0 1.0 1970-01-01 1.0 1.0
1 0.0 2020-09-02 1.0 NaN
2 1.0 2020-09-02 2.0 2.0
3 1.0 2020-09-02 3.0 3.0
对缺失值操作
In [133]: df.dropna(how = 'any')
Out[133]:
A B C F
0 1.0 1970-01-01 1.0 1.0
2 1.0 2020-09-02 2.0 2.0
3 1.0 2020-09-02 3.0 3.0
In [136]: df.fillna(value = 5)
Out[136]:
A B C F
0 1.0 1970-01-01 1.0 1.0
1 0.0 2020-09-02 1.0 5.0
2 1.0 2020-09-02 2.0 2.0
3 1.0 2020-09-02 3.0 3.0
concat()
用于连接Pandas对象
In [138]: df = pd.DataFrame(np.random.randn(10, 4))
In [139]: df
Out[139]:
0 1 2 3
0 -0.309396 -2.150641 -0.443746 -0.330269
1 1.185076 -0.810941 0.171741 -0.534870
2 -0.285067 0.126175 0.213095 -1.131187
3 0.102701 -2.332755 0.651257 0.039092
4 -0.533402 0.669336 1.588347 0.097972
5 -0.834056 -1.178372 -0.470339 -0.204647
6 0.505872 -0.031477 -1.822546 1.718209
7 -0.034870 1.283071 0.000530 0.697393
8 -0.824158 -1.421880 -0.160355 0.511930
9 0.772701 -2.059164 -0.770457 1.548149
In [141]: pieces = [df[:3], df[3:7], df[7:]]
In [142]: pieces
Out[142]:
[ 0 1 2 3
0 -0.309396 -2.150641 -0.443746 -0.330269
1 1.185076 -0.810941 0.171741 -0.534870
2 -0.285067 0.126175 0.213095 -1.131187,
0 1 2 3
3 0.102701 -2.332755 0.651257 0.039092
4 -0.533402 0.669336 1.588347 0.097972
5 -0.834056 -1.178372 -0.470339 -0.204647
6 0.505872 -0.031477 -1.822546 1.718209,
0 1 2 3
7 -0.034870 1.283071 0.000530 0.697393
8 -0.824158 -1.421880 -0.160355 0.511930
9 0.772701 -2.059164 -0.770457 1.548149]
连接
In [143]: pd.concat(pieces)
Out[143]:
0 1 2 3
0 -0.309396 -2.150641 -0.443746 -0.330269
1 1.185076 -0.810941 0.171741 -0.534870
2 -0.285067 0.126175 0.213095 -1.131187
3 0.102701 -2.332755 0.651257 0.039092
4 -0.533402 0.669336 1.588347 0.097972
5 -0.834056 -1.178372 -0.470339 -0.204647
6 0.505872 -0.031477 -1.822546 1.718209
7 -0.034870 1.283071 0.000530 0.697393
8 -0.824158 -1.421880 -0.160355 0.511930
9 0.772701 -2.059164 -0.770457 1.548149
追加
为DataFrame追加行
In [144]: df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
In [145]: df
Out[145]:
A B C D
0 0.399386 1.322999 -0.607676 0.640081
1 -0.515899 0.404083 -0.273217 -2.463714
2 -0.124882 0.765749 0.005764 -0.520626
3 -2.035809 0.440678 -0.541148 0.006324
4 -2.391258 1.048370 0.975226 -0.893675
5 -0.484241 0.544847 0.563982 -1.272684
6 1.218637 -0.650592 -1.541189 1.369380
7 -0.142225 -0.219917 0.383717 -1.811725
In [89]: s = df.iloc[3]
In [90]: df.append(s, ignore_index=True)
Out[90]:
A B C D
0 1.346061 1.511763 1.627081 -0.990582
1 -0.441652 1.211526 0.268520 0.024580
2 -1.577585 0.396823 -0.105381 -0.532532
3 1.453749 1.208843 -0.080952 -0.264610
4 -0.727965 -0.589346 0.339969 -0.693205
5 -0.339355 0.593616 0.884345 1.591431
6 0.141809 0.220390 0.435589 0.192451
7 -0.096701 0.803351 1.715071 -0.708758
8 1.453749 1.208843 -0.080952 -0.264610
In [148]: df.append(s)
Out[148]:
A B C D
0 0.399386 1.322999 -0.607676 0.640081
1 -0.515899 0.404083 -0.273217 -2.463714
2 -0.124882 0.765749 0.005764 -0.520626
3 -2.035809 0.440678 -0.541148 0.006324
4 -2.391258 1.048370 0.975226 -0.893675
5 -0.484241 0.544847 0.563982 -1.272684
6 1.218637 -0.650592 -1.541189 1.369380
7 -0.142225 -0.219917 0.383717 -1.811725
3 -2.035809 0.440678 -0.541148 0.006324
group_by
In [159]: df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],^M
...: ....: 'B': ['one', 'one', 'two', 'three',^M
...: ....: 'two', 'two', 'one', 'three'],^M
...: ....: 'C': np.random.randn(8),^M
...: ....: 'D': np.random.randn(8)})
In [160]: df.groupby('A').mean()
Out[160]:
C D
A
bar -1.075475 -0.001620
foo -0.264180 -0.551601
In [161]: df.groupby(['A', 'B']).sum()
Out[161]:
C D
A B
bar one -1.378534 -0.433647
three -0.859053 0.401976
two -0.988838 0.026811
foo one -1.834255 -1.369606
three 0.222202 -0.535167
two 0.291153 -0.853234
多列分组之后,会生成多层索引