Keyword Search Between Two Dataframes Using Python Pandas
Hi I have two DataFrames like below DF1 Alpha | Numeric | Special and, or | 1,2,3,4,5| @,$,& and DF2 with single column Content | boy or girl | school @ mo
Solution 1:
The solution is a bit complicated, because when a row matches keywords from several columns (row 2), only the match from the first matching column of df1 should be kept
:
# Keyword lookup table: one column per category; None pads the shorter columns.
keyword_columns = {
    'Alpha': ['and', 'or', None, None, None],
    'Numeric': ['1', '2', '3', '4', '5'],
    'Special': ['@', '$', '&', None, None],
}
df1 = pd.DataFrame(keyword_columns)
print(df1)
Alpha Numeric Special
0 and 1 @
1 or 2 $
2 None 3 &
3 None 4 None
4 None 5 None
# Sentences to classify, one per row of the single 'Content' column.
sentences = ['boy or girl', 'school @ morn', '1 school @ morn', 'Pechi']
df2 = pd.DataFrame({'Content': sentences})
print(df2)
Content
0 boy or girl
1 school @ morn
2 1 school @ morn
3 Pechi
# Reshape df1 into long format: one row per keyword.
# Pair every column label with its position so that the original column
# order ends up in the 'col_order' column of the result.
df1.columns = [np.arange(len(df1.columns)), df1.columns]
# The enclosing parentheses are required: a method chain that continues on
# the next line with a leading '.' is a SyntaxError without them.
df11 = (df1.unstack()
           .reset_index(level=2, drop=True)      # discard the original row labels
           .rename_axis(('col_order', 'col_name'))
           .dropna()                             # remove the None padding cells
           .reset_index(name='val'))
print (df11)
col_order col_name val
0 0 Alpha and
1 0 Alpha or
2 1 Numeric 1
3 1 Numeric 2
4 1 Numeric 3
5 1 Numeric 4
6 1 Numeric 5
7 2 Special @
8 2 Special $
9 2 Special &
# Split every sentence on whitespace and reshape to long format:
# one row per token, with 'idx' holding the row label of the sentence in df2.
# The enclosing parentheses are required for the multi-line method chain.
df22 = (df2['Content'].str.split(expand=True)
                      .stack()
                      # split(expand=True) pads shorter sentences with missing
                      # values; drop them explicitly so the result does not
                      # depend on how stack() treats NA in the installed pandas.
                      .dropna()
                      .rename('val')
                      .reset_index(level=1, drop=True)   # discard the token position
                      .rename_axis('idx')
                      .reset_index())
print (df22)
idx val
0 0 boy
1 0 or
2 0 girl
3 1 school
4 1 @
5 1 morn
6 2 1
7 2 school
8 2 @
9 2 morn
10 3 Pechi
# Left-join the tokens onto the keyword table; tokens that match no keyword
# get NaN in 'col_name' and are removed by dropna.
# When a sentence matches several keywords, always keep the first one:
# sort by sentence and column order, then drop duplicate sentences.
df = (pd.merge(df22, df11, on='val', how='left')
        .dropna(subset=['col_name'])
        .sort_values(['idx', 'col_order'])
        .drop_duplicates(['idx']))
# Bring back the original sentences from df2 (aligned on the row label).
# Sentences without any matched keyword get the category 'Other'.
df = (pd.concat([df2, df.set_index('idx')], axis=1)
        .fillna({'col_name': 'Other'})[['val', 'col_name', 'Content']])
print (df)
val col_name Content
0 or Alpha boy or girl
1 @ Special school @ morn
2 1 Numeric 1 school @ morn
3 NaN Other Pechi
EDIT:
:
# Case-insensitive variant: classify each sentence in df2 by the first
# keyword category (column of df1) that one of its whitespace-separated
# tokens belongs to; sentences without any keyword get 'Other'.

# Keyword table: one column per category; None pads the shorter columns.
df1 = pd.DataFrame({'Alpha': ['and', 'or', None, None, None],
                    'Numeric': ['1', '2', '3', '4', '5'],
                    'Special': ['@', '$', '&', None, None]})
# Sentences to classify.
df2 = pd.DataFrame({'Content': ['boy OR girl', 'school @ morn',
                                '1 school @ morn', 'Pechi']})

# If the Alpha keywords in df1 are not already lower case, normalise them
# first with:
#     df1['Alpha'] = df1['Alpha'].str.lower()

# Pair every column label with its position so that the original column
# order ends up in 'col_order' and can be used as the match priority.
df1.columns = [np.arange(len(df1.columns)), df1.columns]

# Long format of the keyword table: one row per keyword.
df11 = (df1.unstack()
           .reset_index(level=2, drop=True)      # discard the original row labels
           .rename_axis(('col_order', 'col_name'))
           .dropna()                             # remove the None padding cells
           .reset_index(name='val_low'))

# Long format of the sentences: one row per token, 'idx' is the row label
# of the sentence in df2.
df22 = (df2['Content'].str.split(expand=True)
                      .stack()
                      # split(expand=True) pads shorter sentences with missing
                      # values; drop them explicitly so df22 does not depend
                      # on how stack() treats NA in the installed pandas.
                      .dropna()
                      .rename('val')
                      .reset_index(level=1, drop=True)   # discard the token position
                      .rename_axis('idx')
                      .reset_index())

# Match on a lower-cased copy so the original token spelling is kept in 'val'.
df22['val_low'] = df22['val'].str.lower()

# Left join, drop tokens that matched nothing, and for sentences with several
# matches keep the one from the left-most df1 column.
df = (pd.merge(df22, df11, on='val_low', how='left')
        .dropna(subset=['col_name'])
        .sort_values(['idx', 'col_order'])
        .drop_duplicates(['idx']))

# Bring back the original sentences; unmatched sentences become 'Other'.
df = (pd.concat([df2, df.set_index('idx')], axis=1)
        .fillna({'col_name': 'Other'})[['val', 'col_name', 'Content']])
print (df)
val col_name Content
0 OR Alpha boy OR girl
1 @ Special school @ morn
2 1 Numeric 1 school @ morn
3 NaN Other Pechi
Post a Comment for "Keyword Search Between Two Dataframes Using Python Pandas"