Working with text
> Formatting settings > Detecting Matches
data in Python # Generate an example DataFrame named df
df = pd.DataFrame({"x": [0.123, 4.567, 8.901]})
# Detect if a regex pattern is present in strings with .str.contains()
suits.str.contains("[ae]") # False True True True
# x
# Count the number of matches with .str.count()
# 0 0.123
suits.str.count("[ae]") # 0 1 2 2
# 1 4.567
Learn Python online at www.DataCamp.com # 2 8.901 # Locate the position of substrings with .str.find()
suits.str.find("e") # -1 -1 1 4
# Visualize and format table output
df.style.format(precision = 1)
- x The output of style.format
> Extracting matches
Example data used throughout 0 0.1 is an HTML table
>
this cheat sheet 1 4.6 # Extract matches from strings with .str.findall()
suits.str.findall(".[ae]") # [] ["ia"] ["he"] ["pa", "de"]
2 8.9
Throughout this cheat sheet, we’ll be using two pandas Series named suits and # Extract capture groups with .str.extractall()
rock_paper_scissors. suits.str.extractall("([ae])(.)")
# 0 1
import pandas as pd
Splitting strings
# match
suits = pd.Series(["clubs", "Diamonds", "hearts", "Spades"])
> # 1 0
# 2 0
a m
e a
rock_paper_scissors = pd.Series(["rock ", " paper", "scissors"]) # 3 0 a d
# Split strings into list of characters with .str.split(pat="")
# 1 e s
suits.str.split(pat="")
# Get subset of strings that match with x[x.str.contains()]
String lengths and substrings
# [, "c" "l" "u" "b" "s", ]
suits[suits.str.contains("d")] # "Diamonds" "Spades"
> # [, "D" "i" "a" "m" "o" "n" "d" "s", ]
# [, "h" "e" "a" "r" "t" "s", ]
# [, "S" "p" "a" "d" "e" "s", ]
# Get the number of characters with .str.len()
suits.str.len() # Returns 5 8 6 6
# Split strings by a separator with .str.split()
suits.str.split(pat = "a")
> Replacing matches
# Get substrings by position with .str[]
# Replace a match with another string with .str.replace() (pass regex=True to treat the pattern as a regex)
suits.str[2:5] # Returns "ubs" "amo" "art" "ade"
# ["clubs"]
suits.str.replace("a", "4") # "clubs" "Di4monds" "he4rts" "Sp4des"
# ["Di", "monds"]
# Get substrings by negative position with .str[]
# ["he", "rts"]
# Remove a suffix with .str.removesuffix()
suits.str[:-3] # "cl" "Diamo" "hea" "Spa"
# ["Sp", "des"]
suits.str.removesuffix("s") # "club" "Diamond" "heart" "Spade"
# Remove whitespace from the start/end with .str.strip()
# Split strings and return DataFrame with .str.split(expand=True)
# Replace a substring with .str.slice_replace()
rock_paper_scissors.str.strip() # "rock" "paper" "scissors"
suits.str.split(pat = "a", expand=True)
rhymes = pd.Series(["vein", "gain", "deign"])
rhymes.str.slice_replace(0, 1, "r") # "rein" "rain" "reign"
# Pad strings to a given length with .str.pad()
# 0 1
suits.str.pad(8, fillchar="_") # "___clubs" "Diamonds" "__hearts" "__Spades" # 0 clubs None
# 1 Di monds
# 2 he rts
# 3 Sp des
> Changing case
# Convert to lowercase with .str.lower()
> Joining or concatenating strings Learn Python Online at
suits.str.lower() # "clubs" "diamonds" "hearts" "spades"
www.DataCamp.com
# Convert to uppercase with .str.upper()
# Combine two strings with +
suits.str.upper() # "CLUBS" "DIAMONDS" "HEARTS" "SPADES"
suits + "5" # "clubs5" "Diamonds5" "hearts5" "Spades5"
# Convert to title case with .str.title()
# Collapse a Series of strings into a single string with .str.cat()
pd.Series("hello, world!").str.title() # "Hello, World!"
suits.str.cat(sep=", ") # "clubs, Diamonds, hearts, Spades"
# Convert to sentence case with .str.capitalize()
# Duplicate and concatenate strings with *
pd.Series("hello, world!").str.capitalize() # "Hello, world!"
suits * 2 # "clubsclubs" "DiamondsDiamonds" "heartshearts" "SpadesSpades"