about summary refs log tree commit diff
path: root/pkgs/development/python-modules/paddleocr/default.nix
blob: 8033d8c79e6df72419aef954bb32300f837b0540 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  attrdict,
  beautifulsoup4,
  cython,
  fire,
  fonttools,
  lmdb,
  lxml,
  numpy,
  opencv4,
  openpyxl,
  pdf2docx,
  pillow,
  premailer,
  pyclipper,
  pymupdf,
  python-docx,
  rapidfuzz,
  scikit-image,
  shapely,
  tqdm,
  paddlepaddle,
  lanms-neo,
  polygon3,
}:

let
  version = "2.7.5";
in
buildPythonPackage {
  pname = "paddleocr";
  inherit version;
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "PaddlePaddle";
    repo = "PaddleOCR";
    rev = "refs/tags/v${version}";
    hash = "sha256-8mnSV4ga6G2cbYCX84XJRFiLCoXstTAtqvg9QqVN6GI=";
  };

  patches = [
    # The `ppocr.data.imaug` re-exports the `IaaAugment` and `CopyPaste`
    # classes. These classes depend on the `imgaug` package which is
    # unmaintained and has been removed from nixpkgs.
    #
    # The image OCR feature of PaddleOCR doesn't use these classes though, so
    # they work even after stripping the the `IaaAugment` and `CopyPaste`
    # exports. It probably breaks some of the OCR model creation tooling that
    # PaddleOCR provides, however.
    ./remove-import-imaug.patch
  ];

  # trying to relax only pymupdf makes the whole build fail
  pythonRelaxDeps = true;
  pythonRemoveDeps = [
    "imgaug"
    "visualdl"
    "opencv-python"
    "opencv-contrib-python"
  ];

  propagatedBuildInputs = [
    attrdict
    beautifulsoup4
    cython
    fire
    fonttools
    lmdb
    lxml
    numpy
    opencv4
    openpyxl
    pdf2docx
    pillow
    premailer
    pyclipper
    pymupdf
    python-docx
    rapidfuzz
    scikit-image
    shapely
    tqdm
    paddlepaddle
    lanms-neo
    polygon3
  ];

  # TODO: The tests depend, among possibly other things, on `cudatoolkit`.
  # But Cudatoolkit fails to install.
  # preCheck = "export HOME=$TMPDIR";
  # nativeCheckInputs = with pkgs; [ which cudatoolkit ];
  doCheck = false;

  meta = with lib; {
    homepage = "https://github.com/PaddlePaddle/PaddleOCR";
    license = licenses.asl20;
    description = "Multilingual OCR toolkits based on PaddlePaddle";
    longDescription = ''
      PaddleOCR aims to create multilingual, awesome, leading, and practical OCR
      tools that help users train better models and apply them into practice.
    '';
    changelog = "https://github.com/PaddlePaddle/PaddleOCR/releases/tag/v${version}";
    maintainers = with maintainers; [ happysalada ];
    platforms = [
      "x86_64-linux"
      "x86_64-darwin"
      "aarch64-darwin"
    ];
  };
}