Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

prepare_data.py 1.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
  1. from typing import Dict, List
  2. import pandas as pd
  3. from datasets import Dataset, load_dataset
  4. def prepare_imdb_data() -> None:
  5. """
  6. Prepare and sample IMDB dataset for sentiment analysis evaluation.
  7. Loads data from HuggingFace, converts to DataFrame, and saves a sample to CSV.
  8. """
  9. # Load the IMDB dataset
  10. print("Loading IMDB dataset...")
  11. imdb: Dataset = load_dataset("imdb") # type: ignore
  12. # Convert labels to more readable format
  13. label_map: Dict[int, str] = {0: "negative", 1: "positive"}
  14. # Create dataframe from test set (we'll use this for zero-shot evaluation)
  15. texts: List[str] = imdb["test"]["text"] # type: ignore
  16. labels: List[int] = imdb["test"]["label"] # type: ignore
  17. eval_df: pd.DataFrame = pd.DataFrame(
  18. {
  19. "text": texts,
  20. "sentiment": [label_map[label] for label in labels],
  21. }
  22. )
  23. # Take a small sample for evaluation
  24. eval_sample: pd.DataFrame = eval_df.sample(n=100, random_state=0)
  25. # Save to CSV file
  26. print("Saving sample to CSV...")
  27. eval_sample.to_csv("imdb_eval_sample.csv", index=False)
  28. print(f"Saved {len(eval_sample)} examples for evaluation")
  29. # Print some statistics
  30. print("\nLabel distribution in evaluation set:")
  31. print(eval_sample["sentiment"].value_counts())
  32. print("\nSample review:")
  33. print("Text:", eval_sample["text"].iloc[0][:200], "...")
  34. print("Sentiment:", eval_sample["sentiment"].iloc[0])
  35. if __name__ == "__main__":
  36. prepare_imdb_data()
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...