Commit 3fc2ddb

Merge branch 'main' into ayushmishra/batch-score-oss
novaturient95 authored Sep 25, 2024
2 parents 19ce4c9 + a1b1203 commit 3fc2ddb
Showing 16 changed files with 120 additions and 0 deletions.
@@ -0,0 +1,4 @@
extra_config: model.yaml
spec: spec.yaml
type: model
categories: ["prompt flow evaluator"]
@@ -0,0 +1,7 @@
| | |
| -- | -- |
| Score range | Float [0-1] |
| What is this metric? | Measures how closely the generated text matches a reference text based on n-gram overlap. |
| How does it work? | The BLEU score calculates the geometric mean of the precision of n-grams between the model-generated text and the reference text, with an added brevity penalty for shorter generated text. The precision is computed for unigrams, bigrams, trigrams, etc., depending on the desired BLEU score level. The more n-grams that are shared between the generated and reference texts, the higher the BLEU score. |
| When to use it? | Use the BLEU score when you want to evaluate the similarity between the generated text and reference text, especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant indicator of quality. |
| What does it need as input? | Ground Truth Response, Generated Response |
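For illustration only (not part of this commit): a BLEU score of this kind can be computed with NLTK's `sentence_bleu`. The sample strings, whitespace tokenization, and smoothing choice below are assumptions; the packaged evaluator may tokenize and smooth differently.

```python
# Illustrative sketch only; assumes nltk is installed (pip install nltk).
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

ground_truth = "The capital of Japan is Tokyo."   # hypothetical reference
response = "Tokyo is the capital of Japan."       # hypothetical generated text

reference_tokens = [ground_truth.split()]  # list of tokenized references
candidate_tokens = response.split()

# Geometric mean of 1- to 4-gram precisions with a brevity penalty;
# smoothing keeps the score non-zero when a higher-order n-gram has no overlap.
score = sentence_bleu(
    reference_tokens,
    candidate_tokens,
    smoothing_function=SmoothingFunction().method4,
)
print(f"bleu_score: {score:.3f}")
```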
@@ -0,0 +1,8 @@
path:
container_name: rai-eval-flows
container_path: models/evaluators/BleuScoreEvaluator/v1/evaluator
storage_name: amlraipfmodels
type: azureblob
publish:
description: description.md
type: custom_model
11 changes: 11 additions & 0 deletions assets/promptflow/evaluators/models/bleu-score-evaluator/spec.yaml
@@ -0,0 +1,11 @@
$schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
name: Bleu-Score-Evaluator
path: ./
properties:
is-promptflow: true
is-evaluator: true
show-artifact: true
_default-display-file: ./evaluator/_bleu.py
tags:
Preview: ""
version: 1
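For context (not part of this commit), evaluators registered this way are typically called through the evaluation SDK rather than invoked directly. A minimal sketch, assuming the azure-ai-evaluation package exposes a `BleuScoreEvaluator` taking `response` and `ground_truth` keyword arguments; the import path and result keys may vary by SDK version.

```python
# Sketch only; the import path and result shape below are assumptions.
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(
    response="Tokyo is the capital of Japan.",      # hypothetical generated text
    ground_truth="The capital of Japan is Tokyo.",  # hypothetical reference
)
print(result)  # expected to be a dict such as {"bleu_score": <float>}
```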
@@ -0,0 +1,4 @@
extra_config: model.yaml
spec: spec.yaml
type: model
categories: ["prompt flow evaluator"]
@@ -0,0 +1,7 @@
| | |
| -- | -- |
| Score range | Float [0-1] |
| What is this metric? | Measures the degree of overlap between the generated text and both the reference text and source text, balancing between precision and recall. |
| How does it work? | The GLEU score is computed by averaging the precision and recall of n-grams between the generated text and both the reference text and source text. It considers both the overlap of n-grams with the reference (similar to BLEU) and penalizes for over-generation. The score provides a balanced metric, where a value of 1 represents perfect overlap, and 0 represents no overlap. |
| When to use it? | Use the GLEU score when you want a more balanced evaluation of generated text that considers both the precision and recall of n-gram overlap, especially useful in evaluating machine translation or paraphrasing tasks. |
| What does it need as input? | Ground Truth Response, Generated Response |
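A comparable sketch (not part of this commit) using NLTK's `sentence_gleu`; the sample strings and whitespace tokenization are assumptions, and the packaged evaluator may differ.

```python
# Illustrative sketch only; assumes nltk is installed.
from nltk.translate.gleu_score import sentence_gleu

ground_truth = "The capital of Japan is Tokyo."   # hypothetical reference
response = "Tokyo is the capital of Japan."       # hypothetical generated text

# NLTK's sentence_gleu counts matching n-grams (n = 1..4 by default) and scores
# the overlap so that both over-generation and under-generation are penalized.
score = sentence_gleu([ground_truth.split()], response.split())
print(f"gleu_score: {score:.3f}")
```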
@@ -0,0 +1,8 @@
path:
container_name: rai-eval-flows
container_path: models/evaluators/GleuScoreEvaluator/v1/evaluator
storage_name: amlraipfmodels
type: azureblob
publish:
description: description.md
type: custom_model
11 changes: 11 additions & 0 deletions assets/promptflow/evaluators/models/gleu-score-evaluator/spec.yaml
@@ -0,0 +1,11 @@
$schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
name: Gleu-Score-Evaluator
path: ./
properties:
is-promptflow: true
is-evaluator: true
show-artifact: true
_default-display-file: ./evaluator/_gleu.py
tags:
Preview: ""
version: 1
@@ -0,0 +1,4 @@
extra_config: model.yaml
spec: spec.yaml
type: model
categories: ["prompt flow evaluator"]
@@ -0,0 +1,7 @@
| | |
| -- | -- |
| Score range | Float [0-1] |
| What is this metric? | Evaluates the quality of the generated text by considering precision, recall, and a range of linguistic features like synonyms, stemming, and word order. |
| How does it work? | The METEOR score is calculated based on the harmonic mean of unigram precision and recall, with higher weight given to recall. It also incorporates additional features such as stemming (matching word roots), synonym matching, and a penalty for incorrect word order. The final score ranges from 0 to 1, where 1 indicates a perfect match. |
| When to use it? | Use the METEOR score when you want a more linguistically informed evaluation metric that captures not only n-gram overlap but also accounts for synonyms, stemming, and word order. This is particularly useful for evaluating tasks like machine translation, text summarization, and text generation. |
| What does it need as input? | Ground Truth Response, Generated Response |
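A rough sketch (not part of this commit) with NLTK's `meteor_score`; it needs the WordNet data for synonym matching, and the sample strings and whitespace tokenization are assumptions.

```python
# Illustrative sketch only; assumes nltk and its WordNet corpus are available.
import nltk
from nltk.translate.meteor_score import meteor_score

nltk.download("wordnet")  # required for synonym matching via WordNet

ground_truth = "The capital of Japan is Tokyo."   # hypothetical reference
response = "Tokyo is the capital of Japan."       # hypothetical generated text

# Harmonic mean of unigram precision and recall (recall-weighted), with stemming,
# WordNet synonym matching, and a fragmentation penalty for scrambled word order.
score = meteor_score([ground_truth.split()], response.split())
print(f"meteor_score: {score:.3f}")
```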
@@ -0,0 +1,8 @@
path:
container_name: rai-eval-flows
container_path: models/evaluators/MeteorScoreEvaluator/v1/evaluator
storage_name: amlraipfmodels
type: azureblob
publish:
description: description.md
type: custom_model
@@ -0,0 +1,11 @@
$schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
name: Meteor-Score-Evaluator
path: ./
properties:
is-promptflow: true
is-evaluator: true
show-artifact: true
_default-display-file: ./evaluator/_meteor.py
tags:
Preview: ""
version: 1
@@ -0,0 +1,4 @@
extra_config: model.yaml
spec: spec.yaml
type: model
categories: ["prompt flow evaluator"]
@@ -0,0 +1,7 @@
| | |
| -- | -- |
| Score range | Float [0-1] |
| What is this metric? | Measures the quality of the generated text by comparing it to a reference text using n-gram recall, precision, and F1-score. |
| How does it work? | The ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) evaluates the similarity between the generated text and reference text based on n-gram overlap, including ROUGE-N (unigram, bigram, etc.), and ROUGE-L (longest common subsequence). It calculates precision, recall, and F1 scores to capture how well the generated text matches the reference text. |
| When to use it? | Use the ROUGE score when you need a robust evaluation metric for text summarization, machine translation, and other natural language processing tasks, especially when focusing on recall and the ability to capture relevant information from the reference text. |
| What does it need as input? | Ground Truth Response, Generated Response |
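A minimal sketch (not part of this commit) using the rouge-score package; whether the packaged evaluator uses this particular library is an assumption, as are the sample strings.

```python
# Illustrative sketch only; assumes the rouge-score package (pip install rouge-score).
from rouge_score import rouge_scorer

ground_truth = "The capital of Japan is Tokyo."   # hypothetical reference
response = "Tokyo is the capital of Japan."       # hypothetical generated text

# ROUGE-1/2 count unigram/bigram overlap; ROUGE-L uses the longest common subsequence.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
for name, s in scorer.score(ground_truth, response).items():
    print(f"{name}: precision={s.precision:.3f} recall={s.recall:.3f} f1={s.fmeasure:.3f}")
```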
@@ -0,0 +1,8 @@
path:
container_name: rai-eval-flows
container_path: models/evaluators/RougeScoreEvaluator/v1/evaluator
storage_name: amlraipfmodels
type: azureblob
publish:
description: description.md
type: custom_model
@@ -0,0 +1,11 @@
$schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
name: Rouge-Score-Evaluator
path: ./
properties:
is-promptflow: true
is-evaluator: true
show-artifact: true
_default-display-file: ./evaluator/_rouge.py
tags:
Preview: ""
version: 1
