Commit f74798a

RyanMullins authored and LIT team committed
Adds init_spec() info to LitMetadata for Models and Datasets.
Removes superfluous log content from init_spec() impls.

PiperOrigin-RevId: 502646185
1 parent: c7fa619

7 files changed (+65 additions, -31 deletions)

lit_nlp/api/dataset.py (+3 -3)

@@ -98,7 +98,7 @@ def description(self) -> str:
     """
     return self._description or inspect.getdoc(self) or ''  # pytype: disable=bad-return-type

-  def init_spec(self) -> Optional[Spec]:
+  def init_spec(self) -> Optional[types.Spec]:
     """Attempts to infer a Spec describing a Dataset's constructor parameters.

     The Dataset base class attempts to infer a Spec for the constructor using
@@ -119,8 +119,8 @@ def init_spec(self) -> Optional[Spec]:
       spec = types.infer_spec_for_func(self.__init__)
     except TypeError as e:
       spec = None
-      logging.warning("Unable to infer init spec for model '%s'. %s",
-                      self.__class__.__name__, str(e), exc_info=True)
+      logging.warning("Unable to infer init spec for dataset '%s'. %s",
+                      self.__class__.__name__, str(e))
     return spec

   def load(self, path: str):
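
For context on the inference above, here is a minimal, hypothetical sketch of how a Dataset subclass picks up init_spec() from the base class. ToyDataset and its constructor parameters are illustrative, not LIT source; only Dataset and types.infer_spec_for_func() come from the diff.

from lit_nlp.api import dataset as lit_dataset


class ToyDataset(lit_dataset.Dataset):
  """A dataset whose constructor parameters can be inferred."""

  def __init__(self, split: str = 'validation', max_examples: int = 100):
    # A real dataset would load `max_examples` examples for `split` here.
    self._examples = []


# init_spec(), inherited from the base class, inspects __init__ via
# types.infer_spec_for_func() and returns a Spec with one entry per typed
# constructor parameter, or None (with the warning above) if inference fails.
print(ToyDataset().init_spec())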

lit_nlp/api/model.py (+1 -1)

@@ -115,7 +115,7 @@ def init_spec(self) -> Optional[Spec]:
     except TypeError as e:
       spec = None
       logging.warning("Unable to infer init spec for model '%s'. %s",
-                      self.__class__.__name__, str(e), exc_info=True)
+                      self.__class__.__name__, str(e))
     return spec

   def is_compatible_with_dataset(self, dataset: lit_dataset.Dataset) -> bool:

lit_nlp/app.py (+32 -22)

@@ -60,6 +60,7 @@ def _build_metadata(self):
       info = {
           'description': model.description(),
           'spec': {
+              'init': self._model_init_specs[name],
               'input': model.input_spec(),
               'output': model.output_spec(),
           }
@@ -96,6 +97,7 @@ def _build_metadata(self):
     dataset_info = {}
     for name, ds in self._datasets.items():
       dataset_info[name] = {
+          'initSpec': self._dataset_init_specs[name],
           'spec': ds.spec(),
           'description': ds.description(),
           'size': len(ds),
@@ -285,7 +287,7 @@ def _get_dataset(self,
                    dataset_name: Optional[str] = None,
                    **unused_kw) -> list[IndexedInput]:
     """Attempt to get dataset, or override with a specific path."""
-    return self._datasets[dataset_name].indexed_examples
+    return list(self._datasets[dataset_name].indexed_examples)

   def _create_dataset(self,
                       unused_data,
@@ -549,30 +551,38 @@ def __init__(
     # client code to manually merge when this is the desired behavior.
     self._layouts = dict(layout.DEFAULT_LAYOUTS, **(layouts or {}))

-    # Wrap models in caching wrapper
-    self._models = {
-        name: caching.CachingModelWrapper(model, name, cache_dir=data_dir)
-        for name, model in models.items()
-    }
-
-    self._datasets: dict[str, lit_dataset.Dataset] = dict(datasets)
-    # TODO(b/202210900): get rid of this, just dynamically create the empty
-    # dataset on the frontend.
-    self._datasets['_union_empty'] = lit_dataset.NoneDataset(self._models)
-
-    self._annotators = annotators or []
-
+    self._model_init_specs: dict[str, Optional[types.Spec]] = {}
+    self._models: dict[str, caching.CachingModelWrapper] = {}
+    for name, model in models.items():
+      # We need to extract and store the results of the original
+      # model.init_spec() here so that we don't lose access to those fields
+      # after LIT wraps the model in a CachingModelWrapper.
+      self._model_init_specs[name] = model.init_spec()
+      # Wrap the model in a caching wrapper and add it to the app.
+      self._models[name] = caching.CachingModelWrapper(model, name,
+                                                       cache_dir=data_dir)
+
+    self._annotators: list[lit_components.Annotator] = annotators or []
     self._saved_datapoints = {}
     self._saved_datapoints_lock = threading.Lock()

-    # Run annotation on each dataset, creating an annotated dataset and
-    # replace the datasets with the annotated versions.
-    for ds_key, ds in self._datasets.items():
-      self._datasets[ds_key] = self._run_annotators(ds)
-
-    # Index all datasets
-    self._datasets = lit_dataset.IndexedDataset.index_all(
-        self._datasets, caching.input_hash)
+    tmp_datasets: dict[str, lit_dataset.Dataset] = dict(datasets)
+    # TODO(b/202210900): get rid of this, just dynamically create the empty
+    # dataset on the frontend.
+    tmp_datasets['_union_empty'] = lit_dataset.NoneDataset(self._models)
+
+    self._dataset_init_specs: dict[str, Optional[types.Spec]] = {}
+    self._datasets: dict[str, lit_dataset.IndexedDataset] = {}
+    for name, ds in tmp_datasets.items():
+      # We need to extract and store the results of the original
+      # dataset.init_spec() here so that we don't lose access to those fields
+      # after LIT goes through the dataset annotation and indexing process.
+      self._dataset_init_specs[name] = ds.init_spec()
+      # Annotate the dataset.
+      annotated_ds = self._run_annotators(ds)
+      # Index the annotated dataset and add it to the app.
+      self._datasets[name] = lit_dataset.IndexedDataset(
+          base=annotated_ds, id_fn=caching.input_hash)

     # Generator initialization
     if generators is not None:
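
The reordering above matters because init_spec() inference is signature-based. The following standalone sketch (hypothetical ToyModel and ToyCachingWrapper classes, not LIT source) shows what would be lost if the spec were read after wrapping: inspection would see the wrapper's constructor, not the wrapped model's.

import inspect
from typing import Optional


class ToyModel:
  def __init__(self, checkpoint_path: str, batch_size: int = 16):
    self.checkpoint_path = checkpoint_path
    self.batch_size = batch_size


class ToyCachingWrapper:
  """Stand-in for caching.CachingModelWrapper."""

  def __init__(self, model: ToyModel, name: str,
               cache_dir: Optional[str] = None):
    self.wrapped = model


# Inference from the wrapper would describe (model, name, cache_dir) and
# lose the model's own (checkpoint_path, batch_size) parameters.
print(inspect.signature(ToyModel.__init__))
print(inspect.signature(ToyCachingWrapper.__init__))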

lit_nlp/client/lib/testing_utils.ts (+15 -1)

@@ -17,7 +17,7 @@

 import 'jasmine';

-import {AttentionHeads, BooleanLitType, CategoryLabel, Embeddings, MulticlassPreds, Scalar, TextSegment, TokenGradients, Tokens} from './lit_types';
+import {AttentionHeads, BooleanLitType, CategoryLabel, Embeddings, MulticlassPreds, Scalar, StringLitType, TextSegment, TokenGradients, Tokens} from './lit_types';
 import {LitMetadata, SerializedLitMetadata} from './types';
 import {createLitType} from './utils';

@@ -56,6 +56,7 @@ export const mockMetadata: LitMetadata = {
   'models': {
     'sst_0_micro': {
       'spec': {
+        'init': {},
         'input': {
           'passage': createLitType(TextSegment),
           'passage_tokens':
@@ -89,6 +90,7 @@ export const mockMetadata: LitMetadata = {
     },
     'sst_1_micro': {
       'spec': {
+        'init': {},
         'input': {
           'passage': createLitType(TextSegment),
           'passage_tokens':
@@ -123,13 +125,17 @@ export const mockMetadata: LitMetadata = {
   },
   'datasets': {
     'sst_dev': {
+      'initSpec': {
+        'split': createLitType(StringLitType),
+      },
       'size': 872,
       'spec': {
         'passage': createLitType(TextSegment),
         'label': createLitType(CategoryLabel, {'vocab': ['0', '1']}),
       }
     },
     'color_test': {
+      'initSpec': null,
       'size': 2,
       'spec': {
         'testNumFeat0': createLitType(Scalar),
@@ -139,6 +145,7 @@ export const mockMetadata: LitMetadata = {
       }
     },
     'penguin_dev': {
+      'initSpec': {},
       'size': 10,
       'spec': {
         'body_mass_g': createLitType(Scalar, {
@@ -197,6 +204,7 @@ export const mockSerializedMetadata: SerializedLitMetadata = {
   'models': {
     'sst_0_micro': {
       'spec': {
+        'init': {},
         'input': {
           'passage': {'__name__': 'TextSegment', 'required': true},
           'passage_tokens':
@@ -243,6 +251,7 @@ export const mockSerializedMetadata: SerializedLitMetadata = {
     },
     'sst_1_micro': {
       'spec': {
+        'init': {},
         'input': {
           'passage': {'__name__': 'TextSegment', 'required': true},
           'passage_tokens':
@@ -290,6 +299,9 @@ export const mockSerializedMetadata: SerializedLitMetadata = {
   },
   'datasets': {
     'sst_dev': {
+      'initSpec': {
+        'split': {'__name__': 'StringLitType', 'required': true}
+      },
       'size': 872,
       'spec': {
         'passage': {'__name__': 'TextSegment', 'required': true},
@@ -298,6 +310,7 @@ export const mockSerializedMetadata: SerializedLitMetadata = {
       }
     },
     'color_test': {
+      'initSpec': null,
       'size': 2,
       'spec': {
         'testNumFeat0': {'__name__': 'Scalar', 'required': true},
@@ -315,6 +328,7 @@ export const mockSerializedMetadata: SerializedLitMetadata = {
       }
     },
     'penguin_dev': {
+      'initSpec': {},
       'size': 10,
       'spec': {
         'body_mass_g': {'__name__': 'Scalar', 'step': 1, 'required': true},

lit_nlp/client/lib/types.ts (+2 -0)

@@ -46,6 +46,7 @@ export interface ComponentInfo {

 export interface DatasetInfo {
   size: number;
+  initSpec: Spec | null;  // using null here because None ==> null in Python
   spec: Spec;
   description?: string;
 }
@@ -64,6 +65,7 @@ export interface CallConfig {
 }

 export interface ModelSpec {
+  init: Spec | null;  // using null here because None ==> null in Python
   input: Spec;
   output: Spec;
 }
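
Taken together with the app.py changes, these interfaces describe the JSON payload the server now emits. Below is a hedged sketch, written in Python for consistency with the server code, of that shape; the names and values are illustrative stand-ins, not actual LIT output.

# Hypothetical metadata payload matching DatasetInfo and ModelSpec above.
example_metadata = {
    'models': {
        'some_model': {
            'spec': {
                'init': None,  # ModelSpec.init; Python None -> JSON null
                'input': {},   # serialized input Spec
                'output': {},  # serialized output Spec
            },
        },
    },
    'datasets': {
        'some_dataset': {
            'initSpec': None,  # DatasetInfo.initSpec; None -> null
            'size': 0,
            'spec': {},        # serialized dataset Spec
        },
    },
}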

lit_nlp/client/lib/utils.ts (+7 -4)

@@ -175,14 +175,17 @@ export function cloneSpec(spec: Spec): Spec {
  */
 export function deserializeLitTypesInLitMetadata(
     metadata: SerializedLitMetadata): LitMetadata {
+
   for (const model of Object.keys(metadata.models)) {
-    metadata.models[model].spec.input =
-        deserializeLitTypesInSpec(metadata.models[model].spec.input);
-    metadata.models[model].spec.output =
-        deserializeLitTypesInSpec(metadata.models[model].spec.output);
+    const {spec} = metadata.models[model];
+    spec.init = spec.init ? deserializeLitTypesInSpec(spec.init) : null;
+    spec.input = deserializeLitTypesInSpec(spec.input);
+    spec.output = deserializeLitTypesInSpec(spec.output);
   }

   for (const dataset of Object.keys(metadata.datasets)) {
+    metadata.datasets[dataset].initSpec = metadata.datasets[dataset].initSpec ?
+        deserializeLitTypesInSpec(metadata.datasets[dataset].initSpec) : null;
     metadata.datasets[dataset].spec =
         deserializeLitTypesInSpec(metadata.datasets[dataset].spec);
   }

lit_nlp/client/services/classification_service_test.ts (+5 -0)

@@ -13,6 +13,7 @@ MULTICLASS_PRED_WITH_THRESHOLD.null_idx = 0;
 MULTICLASS_PRED_WITH_THRESHOLD.vocab = ['0', '1'];
 MULTICLASS_PRED_WITH_THRESHOLD.threshold = 0.3;
 const MULTICLASS_SPEC_WITH_THRESHOLD: ModelSpec = {
+  init: null,
   input: {},
   output: {[FIELD_NAME]: MULTICLASS_PRED_WITH_THRESHOLD}
 };
@@ -21,25 +22,29 @@ const MULTICLASS_PRED_WITHOUT_THRESHOLD = new MulticlassPreds();
 MULTICLASS_PRED_WITHOUT_THRESHOLD.null_idx = 0;
 MULTICLASS_PRED_WITHOUT_THRESHOLD.vocab = ['0', '1'];
 const MULTICLASS_SPEC_WITHOUT_THRESHOLD: ModelSpec = {
+  init: null,
   input: {},
   output: {[FIELD_NAME]: MULTICLASS_PRED_WITHOUT_THRESHOLD}
 };

 const MULTICLASS_PRED_NO_VOCAB = new MulticlassPreds();
 MULTICLASS_PRED_NO_VOCAB.null_idx = 0;
 const INVALID_SPEC_NO_VOCAB: ModelSpec = {
+  init: null,
   input: {},
   output: {[FIELD_NAME]: MULTICLASS_PRED_NO_VOCAB}
 };

 const MULTICLASS_PRED_NO_NULL_IDX = new MulticlassPreds();
 MULTICLASS_PRED_NO_NULL_IDX.vocab = ['0', '1'];
 const INVALID_SPEC_NO_NULL_IDX: ModelSpec = {
+  init: null,
   input: {},
   output: {[FIELD_NAME]: MULTICLASS_PRED_NO_NULL_IDX}
 };

 const INVALID_SPEC_NO_MULTICLASS_PRED: ModelSpec = {
+  init: null,
   input: {},
   output: {}
 };
