Add support for word-level audio transcription timestamp granularity (#733)
* Add support for audio transcription timestamp_granularities word
* Fixup multiple timestamp granularities
This commit is contained in:
27
audio.go
27
audio.go
@@ -27,8 +27,14 @@ const (
|
|||||||
AudioResponseFormatVTT AudioResponseFormat = "vtt"
|
AudioResponseFormatVTT AudioResponseFormat = "vtt"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type TranscriptionTimestampGranularity string
|
||||||
|
|
||||||
|
const (
|
||||||
|
TranscriptionTimestampGranularityWord TranscriptionTimestampGranularity = "word"
|
||||||
|
TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
|
||||||
|
)
|
||||||
|
|
||||||
// AudioRequest represents a request structure for audio API.
|
// AudioRequest represents a request structure for audio API.
|
||||||
// ResponseFormat is not supported for now. We only return JSON text, which may be sufficient.
|
|
||||||
type AudioRequest struct {
|
type AudioRequest struct {
|
||||||
Model string
|
Model string
|
||||||
|
|
||||||
@@ -38,10 +44,11 @@ type AudioRequest struct {
|
|||||||
// Reader is an optional io.Reader when you do not want to use an existing file.
|
// Reader is an optional io.Reader when you do not want to use an existing file.
|
||||||
Reader io.Reader
|
Reader io.Reader
|
||||||
|
|
||||||
Prompt string // For translation, it should be in English
|
Prompt string
|
||||||
Temperature float32
|
Temperature float32
|
||||||
Language string // For translation, just do not use it. It seems "en" works, not confirmed...
|
Language string // Only for transcription.
|
||||||
Format AudioResponseFormat
|
Format AudioResponseFormat
|
||||||
|
TimestampGranularities []TranscriptionTimestampGranularity // Only for transcription.
|
||||||
}
|
}
|
||||||
|
|
||||||
// AudioResponse represents a response structure for audio API.
|
// AudioResponse represents a response structure for audio API.
|
||||||
@@ -62,6 +69,11 @@ type AudioResponse struct {
|
|||||||
NoSpeechProb float64 `json:"no_speech_prob"`
|
NoSpeechProb float64 `json:"no_speech_prob"`
|
||||||
Transient bool `json:"transient"`
|
Transient bool `json:"transient"`
|
||||||
} `json:"segments"`
|
} `json:"segments"`
|
||||||
|
Words []struct {
|
||||||
|
Word string `json:"word"`
|
||||||
|
Start float64 `json:"start"`
|
||||||
|
End float64 `json:"end"`
|
||||||
|
} `json:"words"`
|
||||||
Text string `json:"text"`
|
Text string `json:"text"`
|
||||||
|
|
||||||
httpHeader
|
httpHeader
|
||||||
@@ -179,6 +191,15 @@ func audioMultipartForm(request AudioRequest, b utils.FormBuilder) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(request.TimestampGranularities) > 0 {
|
||||||
|
for _, tg := range request.TimestampGranularities {
|
||||||
|
err = b.WriteField("timestamp_granularities[]", string(tg))
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("writing timestamp_granularities[]: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Close the multipart writer
|
// Close the multipart writer
|
||||||
return b.Close()
|
return b.Close()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,6 +105,10 @@ func TestAudioWithOptionalArgs(t *testing.T) {
|
|||||||
Temperature: 0.5,
|
Temperature: 0.5,
|
||||||
Language: "zh",
|
Language: "zh",
|
||||||
Format: openai.AudioResponseFormatSRT,
|
Format: openai.AudioResponseFormatSRT,
|
||||||
|
TimestampGranularities: []openai.TranscriptionTimestampGranularity{
|
||||||
|
openai.TranscriptionTimestampGranularitySegment,
|
||||||
|
openai.TranscriptionTimestampGranularityWord,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
_, err := tc.createFn(ctx, req)
|
_, err := tc.createFn(ctx, req)
|
||||||
checks.NoError(t, err, "audio API error")
|
checks.NoError(t, err, "audio API error")
|
||||||
|
|||||||
@@ -24,6 +24,10 @@ func TestAudioWithFailingFormBuilder(t *testing.T) {
|
|||||||
Temperature: 0.5,
|
Temperature: 0.5,
|
||||||
Language: "en",
|
Language: "en",
|
||||||
Format: AudioResponseFormatSRT,
|
Format: AudioResponseFormatSRT,
|
||||||
|
TimestampGranularities: []TranscriptionTimestampGranularity{
|
||||||
|
TranscriptionTimestampGranularitySegment,
|
||||||
|
TranscriptionTimestampGranularityWord,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
mockFailedErr := fmt.Errorf("mock form builder fail")
|
mockFailedErr := fmt.Errorf("mock form builder fail")
|
||||||
@@ -47,7 +51,7 @@ func TestAudioWithFailingFormBuilder(t *testing.T) {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
failOn := []string{"model", "prompt", "temperature", "language", "response_format"}
|
failOn := []string{"model", "prompt", "temperature", "language", "response_format", "timestamp_granularities[]"}
|
||||||
for _, failingField := range failOn {
|
for _, failingField := range failOn {
|
||||||
failForField = failingField
|
failForField = failingField
|
||||||
mockFailedErr = fmt.Errorf("mock form builder fail on field %s", failingField)
|
mockFailedErr = fmt.Errorf("mock form builder fail on field %s", failingField)
|
||||||
|
|||||||
Reference in New Issue
Block a user