DagsHub-Science
/
seamless_communication
mirror of https://github.com/facebookresearch/seamless_communication


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
125

	
126

	
127

	
128

	
129

	
130

	
131

	
132

	
133

	
134

	
135

	
136

	
137

	
138

	
139

	
140

	
            # Copyright (c) Meta Platforms, Inc. and affiliates
# All rights reserved.
#
# This source code is licensed under the license found in the
# MIT_LICENSE file in the root directory of this source tree.

import argparse

import torch
from tqdm import tqdm
from pathlib import Path

from sonar.inference_pipelines.speech import (
    SpeechInferenceParams,
)
from seamless_communication.toxicity.mutox.speech_pipeline import (
    MutoxSpeechClassifierPipeline,
)

import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
)

logger = logging.getLogger(__name__)


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Mutox speech will compute a toxicity score for each speech segment it is provided."
    )
    parser.add_argument(
        "data_file",
        type=Path,
        help="Path to the input TSV manifest that list the audio files.",
    )
    parser.add_argument(
        "output_file",
        type=Path,
        help="Path to a TSV file where to save the results.",
    )
    parser.add_argument(
        "--lang",
        type=str,
        help="Language, language of the speech being passed as input, three letter code",
        required=True,
    )
    parser.add_argument(
        "--audio_root_dir",
        type=str,
        help="Root directory for the audio filenames in the data file.",
    )
    parser.add_argument(
        "--audio_path_index",
        type=int,
        help="Index of the column where the audiofile is listed in the input tsv.",
        default="audio",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        help="Inference batch size.",
        default=4,
    )
    parser.add_argument(
        "--n_parallel",
        type=int,
        help="Number of data loading in parallel.",
        default=4,
    )
    parser.add_argument(
        "--device",
        type=str,
        help="name of the device to use with torch.",
        required=False,
    )
    args, _unknown = parser.parse_known_args()

    if args.device is not None:
        device = torch.device(args.device)
        dtype = torch.float32
        if device.type == "cuda":
            dtype = torch.float16
    elif torch.cuda.is_available():
        device = torch.device("cuda:0")
        dtype = torch.float16
        logger.info("using cuda:0, %s", dtype)
    else:
        device = torch.device("cpu")
        dtype = torch.float32
        logger.info("no gpu, using cpu")

    logger.info("loading models.")

    pipeline_builder = MutoxSpeechClassifierPipeline.load_model_from_name(
        mutox_classifier_name="mutox",
        encoder_name=f"sonar_speech_encoder_{args.lang}",
        device=device,
    )

    pipeline = pipeline_builder.build_pipeline(
        SpeechInferenceParams(
            data_file=args.data_file,
            audio_root_dir=args.audio_root_dir,
            audio_path_index=args.audio_path_index,
            target_lang=args.lang,
            batch_size=args.batch_size,
            pad_idx=0,
            device=device,
            fbank_dtype=torch.float32,
            n_parallel=args.n_parallel,
        )
    )

    logger.info("processing.")

    with open(args.output_file, "w", encoding="utf-8") as outf:
        print(
            "input_audio_path",
            "score",
            sep="\t",
            file=outf,
        )
        for example in tqdm(pipeline):
            ex = example["audio"]
            for idx, path in enumerate(ex["path"]):
                print(
                    str(path),
                    ex["data"][idx].item(),
                    sep="\t",
                    file=outf,
                )

    logger.info(f"Done, outputs are in {args.output_file}.")


if __name__ == "__main__":
    main()