refactor: optimization, no need to annotate entire dataset
This commit is contained in:
@@ -56,17 +56,25 @@ def process(df: pd.DataFrame) -> pd.DataFrame:
|
|||||||
df['words'] = df['name'].str.count(' ') + 1
|
df['words'] = df['name'].str.count(' ') + 1
|
||||||
df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
|
df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
|
||||||
|
|
||||||
|
# Calculate probable_native and probable_surname
|
||||||
name_split = df['name'].str.split()
|
name_split = df['name'].str.split()
|
||||||
df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
|
df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
|
||||||
df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
|
df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
|
||||||
df['identified_category'] = df['words'].apply(lambda x: 'compose' if x > 3 else 'simple')
|
df['identified_category'] = df['words'].apply(lambda x: 'compose' if x > 3 else 'simple')
|
||||||
df['identified_name'] = None
|
df['identified_name'] = None
|
||||||
df['identified_surname'] = None
|
df['identified_surname'] = None
|
||||||
|
df['annotated'] = 0
|
||||||
|
|
||||||
|
# We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
|
||||||
|
# This is a common pattern in Congolese names
|
||||||
|
three_word_mask = df['words'] == 3
|
||||||
|
df.loc[three_word_mask, 'identified_name'] = df.loc[three_word_mask, 'probable_native']
|
||||||
|
df.loc[three_word_mask, 'identified_surname'] = df.loc[three_word_mask, 'probable_surname']
|
||||||
|
df.loc[three_word_mask, 'annotated'] = 1
|
||||||
|
|
||||||
logging.info("Mapping regions to provinces")
|
logging.info("Mapping regions to provinces")
|
||||||
df['province'] = df['region'].map(lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1])
|
df['province'] = df['region'].map(lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1])
|
||||||
df['province'] = df['province'].str.lower()
|
df['province'] = df['province'].str.lower()
|
||||||
df['annotated'] = 0
|
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|||||||
+17
-5
@@ -4,16 +4,28 @@ Return null if a part cannot be identified. Do not alter the original name or ad
|
|||||||
|
|
||||||
## Examples:
|
## Examples:
|
||||||
```
|
```
|
||||||
"tshabu ngandu bernard"
|
"tshabu ngandu"
|
||||||
{
|
{
|
||||||
"identified_name": "tshabu ngandu",
|
"identified_name": "tshabu ngandu",
|
||||||
"identified_surname": "bernard"
|
"identified_surname": null
|
||||||
}
|
}
|
||||||
|
|
||||||
"tshisekedi wa mulumba"
|
"bapite marie"
|
||||||
{
|
{
|
||||||
"identified_name": "tshisekedi wa mulumba",
|
"identified_name": "bapite",
|
||||||
"identified_surname": null
|
"identified_surname": "marie"
|
||||||
|
}
|
||||||
|
|
||||||
|
"tshisekedi mulumba jean claude"
|
||||||
|
{
|
||||||
|
"identified_name": "tshisekedi mulumba",
|
||||||
|
"identified_surname": "jean claude"
|
||||||
|
}
|
||||||
|
|
||||||
|
"ilunga wa makuta jean-marie"
|
||||||
|
{
|
||||||
|
"identified_name": "ilunga wa makuta",
|
||||||
|
"identified_surname": "jean-marie"
|
||||||
}
|
}
|
||||||
|
|
||||||
"ntumba wasokadio marie france"
|
"ntumba wasokadio marie france"
|
||||||
|
|||||||
Reference in New Issue
Block a user