about summary refs log tree commit diff
path: root/pkgs/by-name/cl
diff options
context:
space:
mode:
author Julien Malka <julien@malka.sh> 2024-02-27 13:42:58 +0100
committer GitHub <noreply@github.com> 2024-02-27 13:42:58 +0100
commit f52fdedfb0e03ab7308a5d23983670975ddcba0e (patch)
tree 811332d342d5e234b39ff369eabd3508045e30bf /pkgs/by-name/cl
parent 661d1becc7fdb64894cb4741528f574587537f2f (diff)
parent 202dd43c67a840fcf460567fcd748341e7ba71bf (diff)
Merge pull request #291173 from chvp/upd/mu
Diffstat (limited to 'pkgs/by-name/cl')
-rw-r--r-- pkgs/by-name/cl/cld2/package.nix 50
1 files changed, 50 insertions, 0 deletions
diff --git a/pkgs/by-name/cl/cld2/package.nix b/pkgs/by-name/cl/cld2/package.nix
new file mode 100644
index 0000000000000..bf28b160f2bab
--- /dev/null
+++ b/pkgs/by-name/cl/cld2/package.nix
@@ -0,0 +1,50 @@
+{ lib # Nix package expression for cld2 (Compact Language Detector 2), built with stdenv.mkDerivation.
+, stdenv
+, fetchFromGitHub
+, cmake
+, fetchpatch
+}:
+
+stdenv.mkDerivation { # Each of the five arguments above is referenced in this attribute set.
+  pname = "cld2";
+  version = "unstable-2015-08-21"; # Date-stamped "unstable" version; presumably the date of the pinned commit - TODO confirm.
+
+  src = fetchFromGitHub { # Source comes from the CLD2Owners/cld2 repository on GitHub.
+    owner = "CLD2Owners";
+    repo = "cld2";
+    rev = "b56fa78a2fe44ac2851bae5bf4f4693a0644da7b"; # Pinned to a full commit hash, not a tag or branch name.
+    hash = "sha256-YhXs45IbriKWKULguZM4DgfV/Fzr73VHxA1pFTXCyv8="; # SRI-format sha256 for the fetched source.
+  };
+
+  patches = [ # One patch, downloaded with fetchpatch rather than stored next to this file.
+    (fetchpatch {
+      name = "add-cmakelists.txt"; # Judging by the name, this adds a CMakeLists.txt - TODO confirm against the patch itself.
+      url = "https://github.com/CLD2Owners/cld2/pull/65/commits/9cfac02c2ac7802ab7079560b38a474473c45f51.patch"; # A single commit taken from upstream pull request #65.
+      hash = "sha256-uOjmUk8kMFl+wED44ErXoLRyblhgDwFx9K1Wj65Omh8="; # SRI-format sha256 for the fetched patch.
+    })
+  ];
+
+  nativeBuildInputs = [ cmake ]; # cmake is the only build input declared; presumably it drives the build via the patched-in CMakeLists.txt - TODO confirm.
+
+  meta = with lib; { # `with lib` is what brings licenses, maintainers and platforms into scope below.
+    homepage = "https://github.com/CLD2Owners/cld2"; # Same repository that src is fetched from.
+    description = "Compact Language Detector 2";
+    longDescription = ''
+      CLD2 probabilistically detects over 80 languages in Unicode UTF-8 text,
+      either plain text or HTML/XML. Legacy encodings must be converted to valid
+      UTF-8 by the caller. For mixed-language input, CLD2 returns the top three
+      languages found and their approximate percentages of the total text bytes
+      (e.g. 80% English and 20% French out of 1000 bytes of text means about 800
+      bytes of English and 200 bytes of French). Optionally, it also returns a
+      vector of text spans with the language of each identified. This may be
+      useful for applying different spelling-correction dictionaries or
+      different machine translation requests to each span. The design target is
+      web pages of at least 200 characters (about two sentences); CLD2 is not
+      designed to do well on very short text, lists of proper names, part
+      numbers, etc.
+    '';
+    license = licenses.asl20; # Resolves to lib.licenses.asl20 through the enclosing `with lib`.
+    maintainers = with maintainers; [ chvp ]; # Resolves to lib.maintainers.chvp.
+    platforms = platforms.all; # Resolves to lib.platforms.all; no narrower platform list is declared.
+  };
+}