import re
from typing import List
# Ordered (pattern, replacement) pairs, compiled once at import time.
# Order matters: "w/o" must be expanded before "w/", otherwise the "w/" rule
# would rewrite the first two characters of "w/o".
_ABBREVIATIONS = (
    (re.compile(r'\bpt\b'), 'patient'),
    (re.compile(r'\bhx\b'), 'history'),
    (re.compile(r'\bdx\b'), 'diagnosis'),
    (re.compile(r'\btx\b'), 'treatment'),
    (re.compile(r'\brx\b'), 'prescription'),
    (re.compile(r'\bsx\b'), 'symptoms'),
    (re.compile(r'\byo\b'), 'year old'),
    (re.compile(r'\bw/o\b'), 'without'),
    # "w/" ends in a non-word character, so a trailing \b would only match
    # when a word character follows immediately (never for "w/ hx").
    # Consume any following whitespace and emit exactly one space instead;
    # a trailing space at end of input is removed by the final strip().
    (re.compile(r'\bw/\s*'), 'with '),
    (re.compile(r'\bs/p\b'), 'status post'),
)


def preprocess_clinical_text(text: str) -> str:
    """Basic preprocessing for clinical notes.

    Lowercases the text, expands a fixed set of common clinical
    abbreviations, and collapses runs of whitespace.

    Args:
        text: Raw clinical note text.

    Returns:
        The lowercased note with abbreviations expanded, whitespace runs
        replaced by single spaces, and leading/trailing whitespace removed.
    """
    # Lowercase (optional - may lose meaning like "CA" for cancer)
    text = text.lower()
    # Expand common abbreviations
    for pattern, replacement in _ABBREVIATIONS:
        text = pattern.sub(replacement, text)
    # Remove excessive whitespace
    return re.sub(r'\s+', ' ', text).strip()
# Example: run the preprocessing function on a short abbreviated note.
note = "45 yo pt w/ hx of DM presents w/ SOB"
print(preprocess_clinical_text(note))
# Output: "45 year old patient with history of dm presents with sob"
# 14 Text & EHR NLP Applications
Electronic health records contain rich clinical narratives—progress notes, discharge summaries, radiology reports—that encode diagnostic reasoning beyond structured data fields. This chapter covers NLP techniques for extracting, summarizing, and reasoning over clinical text.
14.1 Clinical Text Preprocessing
Clinical Context: Clinical notes are messy: abbreviations (“pt” for patient, “hx” for history), misspellings, copy-pasted templates, and institution-specific jargon. Preprocessing pipelines must handle this variability while preserving clinical meaning.
14.1.1 Challenges in Clinical Text
Clinical text differs from general English:
- Abbreviations: “SOB” means shortness of breath, not an insult
- Negation: “No fever” is the opposite of “fever”
- Uncertainty: “possible pneumonia” differs from “pneumonia”
- Temporality: “history of MI” differs from “acute MI”
- Section structure: Assessment differs from Plan, which differs from History
14.1.2 Basic Preprocessing Pipeline
14.1.3 Section Segmentation
Clinical notes have structure. Extracting sections enables targeted analysis:
# Header alternatives for each section key. They are matched
# case-insensitively and only at the start of a line.
_SECTION_HEADERS = {
    'chief_complaint': r'chief complaint|cc',
    'history': r'history of present illness|hpi',
    'assessment': r'assessment|impression',
    'plan': r'plan',
}


def extract_sections(note: str) -> dict:
    """Extract common sections from clinical note.

    A section starts at a line beginning with one of the known headers
    (any letter case, optional colon) and runs until the next line that
    starts with an uppercase letter, or the end of the note. A continuation
    line that itself starts with an uppercase letter therefore ends the
    section early.

    Args:
        note: Full text of the clinical note.

    Returns:
        Dict mapping section key ('chief_complaint', 'history',
        'assessment', 'plan') to the stripped section text. Sections whose
        header is not found are omitted.
    """
    sections = {}
    for section_name, header in _SECTION_HEADERS.items():
        # Case-insensitivity is scoped to the header via (?i:...). Applying
        # re.IGNORECASE globally would make the [A-Z] terminator match
        # lowercase letters too and cut every section at its first newline.
        # The ^ anchor and \b keep "plan" from matching inside words such
        # as "Explanation" or "Planned".
        pattern = (
            rf'^[ \t]*(?i:{header})\b[:\s]*'
            r'(.+?)(?=\n[A-Z]|\Z)'
        )
        match = re.search(pattern, note, re.MULTILINE | re.DOTALL)
        if match:
            sections[section_name] = match.group(1).strip()
    return sections


# 14.2 Named Entity Recognition with scispaCy
Clinical Context: Extracting structured information from notes—medications, diagnoses, procedures—enables downstream analysis. Named entity recognition (NER) identifies and classifies these mentions automatically.
14.2.1 scispaCy for Biomedical NER
scispaCy provides pretrained models for biomedical and clinical text:
import scispacy
import spacy
from scispacy.linking import EntityLinker
# Load clinical NER model
nlp = spacy.load("en_ner_bc5cdr_md")  # Diseases and chemicals
# Add UMLS entity linker
nlp.add_pipe("scispacy_linker", config={
    "resolve_abbreviations": True,
    "linker_name": "umls",
})
# Process clinical text
text = """Patient presents with Type 2 diabetes mellitus and
hypertension. Started on metformin 500mg twice daily."""
doc = nlp(text)
# Extract entities: one line per mention with its label and character span
for ent in doc.ents:
    print(f"{ent.text:20} | {ent.label_:10} | {ent.start_char}-{ent.end_char}")
# Output:
# Type 2 diabetes mellitus | DISEASE | 22-46
# hypertension | DISEASE | 51-63
# metformin | CHEMICAL | 76-85

# 14.2.2 UMLS Concept Linking
The Unified Medical Language System (UMLS) provides standardized concept identifiers (CUIs). Linking entities to UMLS enables:
- Normalization: “heart attack” and “MI” map to same concept
- Hierarchy: Navigate broader/narrower relationships
- Cross-referencing: Link to ICD codes, RxNorm, SNOMED-CT
# Get UMLS concepts for entities
# The linker component is the same for every entity, so look it up once.
linker = nlp.get_pipe("scispacy_linker")
for ent in doc.ents:
    if not ent._.kb_ents:  # No UMLS links available for this mention
        continue
    cui, score = ent._.kb_ents[0]  # Top match
    concept = linker.kb.cui_to_entity[cui]
    # NOTE(review): scispaCy declares the concept definition as optional;
    # fall back to an empty string so slicing cannot fail on None.
    definition = concept.definition or ""
    print(f"{ent.text}")
    print(f" CUI: {cui}")
    print(f" Name: {concept.canonical_name}")
    print(f" Definition: {definition[:100]}...")
    print()

# 14.2.3 Custom NER for Clinical Entities
For institution-specific entities (local medication names, procedure codes), train custom models:
import spacy
from spacy.training import Example
# Create blank model
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
# Add custom labels
ner.add_label("MEDICATION")
ner.add_label("DOSAGE")
ner.add_label("FREQUENCY")
# Training data: (text, {"entities": [(start, end, label), ...]})
# Offsets are character positions into the text, end-exclusive.
train_data = [
    ("Take lisinopril 10mg daily", {
        "entities": [(5, 15, "MEDICATION"), (16, 20, "DOSAGE"),
                     (21, 26, "FREQUENCY")]
    }),
    ("Metformin 500mg twice daily with meals", {
        "entities": [(0, 9, "MEDICATION"), (10, 15, "DOSAGE"),
                     (21 - 5, 27, "FREQUENCY")]
    }),
]
# Training loop (simplified)
# initialize() is the spaCy v3 replacement for the deprecated
# begin_training(); it returns the optimizer to pass to update().
optimizer = nlp.initialize()
for epoch in range(30):
    losses = {}
    for text, annotations in train_data:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        # Pass the optimizer explicitly so the one created above is used.
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {epoch}: {losses}")

# 14.3 Clinical Summarization
Clinical Context: A hospitalist reviewing a complex patient’s chart faces hundreds of pages of notes. AI summarization can distill key information—active problems, recent changes, pending actions—saving time and reducing cognitive load.
14.3.1 Extractive vs. Abstractive Summarization
- Extractive: Selects important sentences from the original text. Safer but may miss connections.
- Abstractive: Generates new text summarizing the content. More fluent, but at risk of hallucination.
For clinical use, extractive methods or constrained abstractive approaches are safer.
14.3.2 LLM-Based Summarization
Large language models can generate fluent summaries but require careful prompting:
from openai import OpenAI
client = OpenAI()


def summarize_discharge(note: str) -> str:
    """Summarize discharge summary using LLM.

    Sends the note to the chat-completions endpoint with instructions to
    produce 3-5 bullet points restricted to information in the note.

    Args:
        note: Full text of the discharge summary.

    Returns:
        The model's summary text, or an empty string if the response
        carries no message content.
    """
    prompt = (
        "Summarize this discharge summary in 3-5 bullet points.\n"
        "Include: primary diagnosis, key procedures, discharge medications,\n"
        "and follow-up instructions. Do not add information not present\n"
        "in the original note.\n"
        "DISCHARGE SUMMARY:\n"
        f"{note}\n"
        "SUMMARY:"
    )
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a clinical summarization assistant. Only include information explicitly stated in the provided text."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,  # Lower temperature for factual tasks
        max_tokens=500,
    )
    # message.content may be None; keep the declared str return type.
    return response.choices[0].message.content or ""
# For local/private deployment, use Ollama with Llama
from ollama import chat
def summarize_local(note: str) -> str:
    """Summarize using local LLM (no data leaves institution).

    Args:
        note: Full text of the clinical note.

    Returns:
        The content of the message returned by the 'llama3' model.
    """
    response = chat(
        model='llama3',
        messages=[{
            'role': 'user',
            'content': f"Summarize this clinical note in 3 bullet points:\n\n{note}",
        }],
    )
    return response['message']['content']


# 14.3.3 Structured Extraction
For specific fields, structured extraction is more reliable than free-form summarization:
from pydantic import BaseModel
from typing import List, Optional
import json
class MedicationExtraction(BaseModel):
    """One medication entry extracted from a discharge note."""

    name: str
    # Explicit None defaults: under pydantic v2 an Optional[...] annotation
    # alone leaves the field required, so an entry that omits dose,
    # frequency, or route would fail validation.
    dose: Optional[str] = None
    frequency: Optional[str] = None
    route: Optional[str] = None
class DischargeExtraction(BaseModel):
    """Structured fields extracted from a discharge summary."""

    primary_diagnosis: str
    secondary_diagnoses: List[str]
    medications: List[MedicationExtraction]
    # Explicit None default so a note without a follow-up interval still
    # validates under pydantic v2 (Optional alone does not imply a default).
    follow_up_days: Optional[int] = None
def extract_structured(note: str) -> DischargeExtraction:
    """Extract structured fields from discharge note.

    Asks the model for a JSON object, parses it, and validates it against
    DischargeExtraction.

    Args:
        note: Full text of the discharge summary.

    Returns:
        A DischargeExtraction built from the model's JSON response.

    Raises:
        json.JSONDecodeError: If the response content is not valid JSON.
    """
    # The prompt spells out the expected keys; the original referred to
    # "the schema" without ever including one.
    prompt = (
        "Extract the following from this discharge summary.\n"
        "Return as JSON matching the schema.\n"
        "Schema (JSON object keys):\n"
        '- "primary_diagnosis": string\n'
        '- "secondary_diagnoses": list of strings\n'
        '- "medications": list of objects with keys "name" (string), '
        '"dose", "frequency", "route" (string or null)\n'
        '- "follow_up_days": integer or null\n'
        f"{note}"
    )
    # NOTE(review): JSON mode (response_format) is only available on some
    # chat models - confirm that the configured model supports it.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0,
    )
    data = json.loads(response.choices[0].message.content)
    return DischargeExtraction(**data)


# 14.4 Hallucination Detection and Mitigation
Clinical Context: An LLM that invents a medication allergy or fabricates a lab value could cause patient harm. Hallucination—generating plausible but false information—is a critical risk in clinical NLP.
14.4.1 Types of Hallucinations
- Intrinsic: Contradicts the source document (“patient denies chest pain” summarized as “chest pain present”)
- Extrinsic: Adds information not in the source (inventing a medication)
- Factual: States incorrect medical facts (wrong drug interactions)
14.4.2 Detection Strategies
1. Entailment checking: Verify each output claim is supported by input
from transformers import pipeline
# NLI model to check if output is entailed by input
nli = pipeline("text-classification",
               model="microsoft/deberta-large-mnli")


def check_entailment(source: str, claim: str) -> dict:
    """Check if claim is supported by source text.

    Args:
        source: Premise text (the original document or passage).
        claim: Hypothesis text to verify against the source.

    Returns:
        Dict with the predicted "label" and its "score" as reported by the
        classification pipeline.
    """
    # Pass premise and hypothesis as a text pair so the tokenizer inserts
    # the model's own separator tokens, instead of embedding a literal
    # "[SEP]" string in a single input.
    result = nli({"text": source, "text_pair": claim})
    # The pipeline may return either a single dict or a one-element list.
    if isinstance(result, list):
        result = result[0]
    return {
        "label": result["label"],
        "score": result["score"],
    }
# Example: the claim mentions insulin, which the source does not.
source = "Patient has Type 2 diabetes controlled with metformin."
claim = "Patient takes insulin for diabetes."
result = check_entailment(source, claim)
# Returns: {"label": "CONTRADICTION", "score": 0.94}
# 2. Source grounding: Require citations to original text
def summarize_with_citations(note: str) -> str:
    """Build a summarization prompt that demands a quoted citation per claim.

    NOTE: the LLM call is elided in this snippet, so as written the
    function only builds the prompt and returns None.

    Args:
        note: Full text of the clinical note.
    """
    prompt = (
        "Summarize this note. For each claim, cite the exact\n"
        "quote from the original text in [brackets].\n"
        "Example format:\n"
        "- Patient has diabetes [Type 2 DM noted in problem list]\n"
        "NOTE:\n"
        f"{note}"
    )
    # ... LLM call


# 3. Confidence thresholding: Flag low-confidence outputs for human review
14.4.3 Mitigation Strategies
- Constrained generation: Limit outputs to predefined options
- Retrieval augmentation: Ground responses in retrieved documents
- Human-in-the-loop: Require clinician verification before use
- Temperature 0: Use deterministic decoding for factual tasks
- Fine-tuning: Train on clinical data with factual consistency rewards
14.5 Comparing Approaches: scispaCy vs. LLMs
Clinical Context: HW6 asks you to compare rule-based/scispaCy extraction with LLM-based approaches. Each has trade-offs.
14.5.1 scispaCy Strengths
- Deterministic: Same input always produces same output
- Interpretable: Can inspect which rules/patterns matched
- Fast: Processes thousands of notes per second
- Private: Runs locally, no data leaves institution
- Validated: Pretrained models have published benchmarks
14.5.2 LLM Strengths
- Flexible: Handles novel phrasings without retraining
- Contextual: Understands nuance, negation, uncertainty
- Multi-task: One model for extraction, summarization, Q&A
- Few-shot: Adapts to new tasks with examples in prompt
14.5.3 Cost-Benefit Analysis
import time
import numpy as np
def benchmark_approaches(notes: list, n_runs: int = 3):
    """Compare scispaCy vs LLM for entity extraction.

    Times n_runs full passes over notes for each approach and prints the
    mean wall-clock time per pass together with a cost figure.

    Args:
        notes: Clinical note texts to process.
        n_runs: Number of timed passes per approach.
    """
    results = {"scispacy": [], "llm": []}
    # scispaCy benchmark (model loading is not included in the timing)
    nlp = spacy.load("en_ner_bc5cdr_md")
    for _ in range(n_runs):
        # perf_counter is a monotonic clock intended for measuring intervals
        start = time.perf_counter()
        for note in notes:
            doc = nlp(note)
            # Materialize the entities so extraction cost is measured
            [(e.text, e.label_) for e in doc.ents]
        results["scispacy"].append(time.perf_counter() - start)
    # LLM benchmark (simplified)
    for _ in range(n_runs):
        start = time.perf_counter()
        for note in notes:
            # API call to extract entities
            extract_with_llm(note)
        results["llm"].append(time.perf_counter() - start)
    print(f"scispaCy: {np.mean(results['scispacy']):.2f}s "
          f"(${0:.4f})")
    print(f"LLM: {np.mean(results['llm']):.2f}s "
          f"(${len(notes) * 0.01:.2f})")  # Estimated API cost


# 14.5.4 Recommendation
For HW6 and clinical deployment:
- Use scispaCy for high-volume, well-defined extraction tasks
- Use LLMs for complex reasoning, summarization, or novel tasks
- Combine both: scispaCy for entity extraction, LLM for relationship extraction
- Always validate outputs against ground truth before deployment
14.6 Integration with Structured EHR Data
Clinical Context: The richest predictions combine narrative text with structured data—lab values, vital signs, billing codes. Multi-modal EHR models outperform text-only or structured-only approaches.
14.6.1 Feature Engineering from Text
Convert extracted entities to features:
import pandas as pd
from collections import Counter
# Column order of the frame returned by text_to_features. Passing it
# explicitly keeps the schema stable even when no notes are supplied.
_FEATURE_COLUMNS = [
    "n_diseases",
    "n_medications",
    "has_diabetes",
    "has_hypertension",
    "note_length",
]


def text_to_features(notes: list, nlp) -> pd.DataFrame:
    """Convert clinical notes to structured features.

    Args:
        notes: Clinical note texts.
        nlp: Callable that maps a note to a document exposing `.ents`,
            where each entity has `.text` and `.label_`.

    Returns:
        DataFrame with one row per note: counts of DISEASE and CHEMICAL
        entities, 0/1 flags for diabetes and hypertension mentions among
        the entities, and the note length in characters. An empty `notes`
        list yields an empty frame with the same columns.
    """
    features = []
    for note in notes:
        doc = nlp(note)
        # Count entity types
        entity_counts = Counter(ent.label_ for ent in doc.ents)
        # Lowercase each mention once; both flag checks below reuse it
        mentions = [ent.text.lower() for ent in doc.ents]
        features.append({
            "n_diseases": entity_counts.get("DISEASE", 0),
            "n_medications": entity_counts.get("CHEMICAL", 0),
            "has_diabetes": int(any("diabet" in m for m in mentions)),
            "has_hypertension": int(any("hypertens" in m for m in mentions)),
            "note_length": len(note),
        })
    return pd.DataFrame(features, columns=_FEATURE_COLUMNS)
# Combine with structured data
text_features = text_to_features(patient_notes, nlp)
# NOTE(review): concat(axis=1) aligns rows by index label; this assumes
# structured_data has the same default 0..n-1 index and row order as
# patient_notes - confirm against where structured_data is built.
combined = pd.concat([structured_data, text_features], axis=1)

# 14.6.2 Clinical BERT Embeddings
For deep learning, use pretrained clinical language models:
from transformers import AutoTokenizer, AutoModel
import torch
# Load ClinicalBERT
# Tokenizer and model are loaded from the same checkpoint name so that the
# token ids produced by the tokenizer match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
"emilyalsentzer/Bio_ClinicalBERT"
)
model = AutoModel.from_pretrained(
"emilyalsentzer/Bio_ClinicalBERT"
)
def get_note_embedding(note: str) -> torch.Tensor:
    """Get ClinicalBERT embedding for a note.

    Args:
        note: Clinical note text. Input beyond 512 tokens is truncated.

    Returns:
        The hidden state at the first token position of the final layer,
        with the batch dimension removed.
    """
    inputs = tokenizer(note, return_tensors="pt",
                       truncation=True, max_length=512)
    # Inference only: skip gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
        # Use [CLS] token embedding (position 0 of the sequence)
        embedding = outputs.last_hidden_state[:, 0, :]
    # Drop only the batch dimension (a single note is encoded per call)
    return embedding.squeeze(0)
# Get embeddings for all notes
embeddings = torch.stack([get_note_embedding(n) for n in notes])
# Use as features in downstream model
# NOTE(review): torch.cat along dim=1 requires structured_tensor to have one
# row per note, in the same order as `notes` - confirm at the call site.
combined_features = torch.cat([
    structured_tensor,
    embeddings,
], dim=1)
# These embeddings capture semantic meaning and can be combined with structured features for prediction tasks like readmission risk or mortality prediction.